作業部屋の使い方を試しています。
branches/b3/WebScraping をマージ
@@ -1,108 +1,71 @@ | ||
1 | 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?> |
2 | -<searchdata> | |
3 | - <url>http://weather.yahoo.co.jp/weather/</url> | |
4 | - <searchlist> | |
5 | - <item>天気01</item> | |
6 | - <htmltag>li</htmltag> | |
7 | - <htmlid/> | |
8 | - <htmlclass>point pt1400</htmlclass> | |
9 | - <around/> | |
10 | - <regexp/> | |
11 | - </searchlist> | |
12 | - <searchlist> | |
13 | - <item>天気02</item> | |
14 | - <htmltag>li</htmltag> | |
15 | - <htmlid/> | |
16 | - <htmlclass>point pt1900</htmlclass> | |
17 | - <around/> | |
18 | - <regexp/> | |
19 | - </searchlist> | |
20 | - <searchlist> | |
21 | - <item>天気03</item> | |
22 | - <htmltag>li</htmltag> | |
23 | - <htmlid/> | |
24 | - <htmlclass>point pt3410</htmlclass> | |
25 | - <around/> | |
26 | - <regexp/> | |
27 | - </searchlist> | |
28 | - <searchlist> | |
29 | - <item>天気04</item> | |
30 | - <htmltag>li</htmltag> | |
31 | - <htmlid/> | |
32 | - <htmlclass>point pt4410</htmlclass> | |
33 | - <around/> | |
34 | - <regexp/> | |
35 | - </searchlist> | |
36 | - <searchlist> | |
37 | - <item>天気05</item> | |
38 | - <htmltag>li</htmltag> | |
39 | - <htmlid/> | |
40 | - <htmlclass>point pt5110</htmlclass> | |
41 | - <around/> | |
42 | - <regexp/> | |
43 | - </searchlist> | |
44 | - <searchlist> | |
45 | - <item>天気06</item> | |
46 | - <htmltag>li</htmltag> | |
47 | - <htmlid/> | |
48 | - <htmlclass>point pt5410</htmlclass> | |
49 | - <around/> | |
50 | - <regexp/> | |
51 | - </searchlist> | |
52 | - <searchlist> | |
53 | - <item>天気07</item> | |
54 | - <htmltag>li</htmltag> | |
55 | - <htmlid/> | |
56 | - <htmlclass>point pt5610</htmlclass> | |
57 | - <around/> | |
58 | - <regexp/> | |
59 | - </searchlist> | |
60 | - <searchlist> | |
61 | - <item>天気08</item> | |
62 | - <htmltag>li</htmltag> | |
63 | - <htmlid/> | |
64 | - <htmlclass>point pt6200</htmlclass> | |
65 | - <around/> | |
66 | - <regexp/> | |
67 | - </searchlist> | |
68 | - <searchlist> | |
69 | - <item>天気09</item> | |
70 | - <htmltag>li</htmltag> | |
71 | - <htmlid/> | |
72 | - <htmlclass>point pt6710</htmlclass> | |
73 | - <around/> | |
74 | - <regexp/> | |
75 | - </searchlist> | |
76 | - <searchlist> | |
77 | - <item>天気10</item> | |
78 | - <htmltag>li</htmltag> | |
79 | - <htmlid/> | |
80 | - <htmlclass>point pt7410</htmlclass> | |
81 | - <around/> | |
82 | - <regexp/> | |
83 | - </searchlist> | |
84 | - <searchlist> | |
85 | - <item>天気11</item> | |
86 | - <htmltag>li</htmltag> | |
87 | - <htmlid/> | |
88 | - <htmlclass>point pt8210</htmlclass> | |
89 | - <around/> | |
90 | - <regexp/> | |
91 | - </searchlist> | |
92 | - <searchlist> | |
93 | - <item>天気12</item> | |
94 | - <htmltag>li</htmltag> | |
95 | - <htmlid/> | |
96 | - <htmlclass>point pt8810</htmlclass> | |
97 | - <around/> | |
98 | - <regexp/> | |
99 | - </searchlist> | |
100 | - <searchlist> | |
101 | - <item>天気13</item> | |
102 | - <htmltag>li</htmltag> | |
103 | - <htmlid/> | |
104 | - <htmlclass>point pt9110</htmlclass> | |
105 | - <around/> | |
106 | - <regexp/> | |
107 | - </searchlist> | |
108 | -</searchdata> | |
\ No newline at end of file | ||
2 | +<xmlcontainer> | |
3 | +<webscraping> | |
4 | +<url>http://weather.yahoo.co.jp/weather/</url> | |
5 | +<searchlist listNo="1"> | |
6 | +<item>天気01</item> | |
7 | +<htmltag>li</htmltag> | |
8 | +<htmlclass>point pt1400</htmlclass> | |
9 | +</searchlist> | |
10 | +<searchlist listNo="2"> | |
11 | +<item>天気02</item> | |
12 | +<htmltag>li</htmltag> | |
13 | +<htmlclass>point pt1900</htmlclass> | |
14 | +</searchlist> | |
15 | +<searchlist listNo="3"> | |
16 | +<item>天気03</item> | |
17 | +<htmltag>li</htmltag> | |
18 | +<htmlclass>point pt3410</htmlclass> | |
19 | +</searchlist> | |
20 | +<searchlist listNo="4"> | |
21 | +<item>天気04</item> | |
22 | +<htmltag>li</htmltag> | |
23 | +<htmlclass>point pt4410</htmlclass> | |
24 | +</searchlist> | |
25 | +<searchlist listNo="5"> | |
26 | +<item>天気05</item> | |
27 | +<htmltag>li</htmltag> | |
28 | +<htmlclass>point pt5110</htmlclass> | |
29 | +</searchlist> | |
30 | +<searchlist listNo="6"> | |
31 | +<item>天気06</item> | |
32 | +<htmltag>li</htmltag> | |
33 | +<htmlclass>point pt5410</htmlclass> | |
34 | +</searchlist> | |
35 | +<searchlist listNo="7"> | |
36 | +<item>天気07</item> | |
37 | +<htmltag>li</htmltag> | |
38 | +<htmlclass>point pt5610</htmlclass> | |
39 | +</searchlist> | |
40 | +<searchlist listNo="8"> | |
41 | +<item>天気08</item> | |
42 | +<htmltag>li</htmltag> | |
43 | +<htmlclass>point pt6200</htmlclass> | |
44 | +</searchlist> | |
45 | +<searchlist listNo="9"> | |
46 | +<item>天気09</item> | |
47 | +<htmltag>li</htmltag> | |
48 | +<htmlclass>point pt6710</htmlclass> | |
49 | +</searchlist> | |
50 | +<searchlist listNo="10"> | |
51 | +<item>天気10</item> | |
52 | +<htmltag>li</htmltag> | |
53 | +<htmlclass>point pt7410</htmlclass> | |
54 | +</searchlist> | |
55 | +<searchlist listNo="11"> | |
56 | +<item>天気11</item> | |
57 | +<htmltag>li</htmltag> | |
58 | +<htmlclass>point pt8210</htmlclass> | |
59 | +</searchlist> | |
60 | +<searchlist listNo="12"> | |
61 | +<item>天気12</item> | |
62 | +<htmltag>li</htmltag> | |
63 | +<htmlclass>point pt8810</htmlclass> | |
64 | +</searchlist> | |
65 | +<searchlist listNo="13"> | |
66 | +<item>天気13</item> | |
67 | +<htmltag>li</htmltag> | |
68 | +<htmlclass>point pt9110</htmlclass> | |
69 | +</searchlist> | |
70 | +</webscraping> | |
71 | +</xmlcontainer> |
@@ -1 +1,217 @@ | ||
1 | -<?xml version="1.0" encoding="UTF-8" standalone="no"?><searchdata><url>http://stocks.finance.yahoo.co.jp/stocks/detail/?code=9984.T</url><searchlist><item>銘柄コード</item><htmltag>dl</htmltag><htmlid/><htmlclass>stocksInfo clearFix</htmlclass><around/><regexp>(^\d{4})</regexp></searchlist><searchlist><item>カテゴリ</item><htmltag>div</htmltag><htmlid/><htmlclass>stockMainTabParts stockMainTabPartsCurrent</htmlclass><around/><regexp/></searchlist><searchlist><item>業種</item><htmltag>dd</htmltag><htmlid/><htmlclass>category yjSb</htmlclass><around/><regexp/></searchlist><searchlist><item>取得時間</item><htmltag>dd</htmltag><htmlid/><htmlclass>yjSb real</htmlclass><around/><regexp>^(.*)\t</regexp></searchlist><searchlist><item>銘柄名</item><htmltag>th</htmltag><htmlid/><htmlclass>symbol</htmlclass><around/><regexp/></searchlist><searchlist><item>株価</item><htmltag>td</htmltag><htmlid/><htmlclass>stoksPrice</htmlclass><around/><regexp/></searchlist><searchlist><item>前日比</item><htmltag>td</htmltag><htmlid/><htmlclass>change</htmlclass><around/><regexp>\t(.*)(.*%)</regexp></searchlist><searchlist><item>前日比%</item><htmltag>td</htmltag><htmlid/><htmlclass>change</htmlclass><around/><regexp>\t.*((.*)%)</regexp></searchlist><searchlist><item>前日終値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>0</around><regexp>^([,0-9]+)\t</regexp></searchlist><searchlist><item>始値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>1</around><regexp>^([,0-9]+|-{3})\t</regexp></searchlist><searchlist><item>高値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>2</around><regexp>^([,0-9]+|-{3})\t</regexp></searchlist><searchlist><item>安値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>3</around><regexp>^([,0-9]+|-{3})\t</regexp></searchlist><searchlist><item>出来高</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi 
clearfix</htmlclass><around>4</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>売買代金</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>5</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>値幅制限</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi clearfix</htmlclass><around>6</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>時価総額</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>0</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>発行済株式数</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>1</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>配当利回り</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>2</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>1株配当</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>3</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>PER</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>4</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>PBR</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>5</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>EPS</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>6</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>BPS</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>7</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>最低購入代金</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>8</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>単元株数</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS 
clearfix</htmlclass><around>9</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>年初来高値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>10</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>年初来安値</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>11</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用買残</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>12</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用買残前週比</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>13</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用売残</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>14</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>信用売残前週比</item><htmltag>div</htmltag><htmlid/><htmlclass>lineFi yjMS clearfix</htmlclass><around>15</around><regexp>^(.*?)\t</regexp></searchlist><searchlist><item>貸借倍率</item><htmltag>div</htmltag><htmlid/><htmlclass>yjMS clearfix</htmlclass><around/><regexp>^(.*?)\t</regexp></searchlist></searchdata> | |
\ No newline at end of file | ||
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<xmlcontainer> | |
3 | +<webscraping> | |
4 | +<url>http://stocks.finance.yahoo.co.jp/stocks/detail/?code=5020.T</url> | |
5 | +<searchlist listNo="1"> | |
6 | +<item>銘柄コード</item> | |
7 | +<htmltag>dl</htmltag> | |
8 | +<htmlclass>stocksInfo clearFix</htmlclass> | |
9 | +<regexp>(^\d{4})</regexp> | |
10 | +</searchlist> | |
11 | +<searchlist listNo="2"> | |
12 | +<item>カテゴリ</item> | |
13 | +<htmltag>div</htmltag> | |
14 | +<htmlclass>stockMainTabParts stockMainTabPartsCurrent</htmlclass> | |
15 | +</searchlist> | |
16 | +<searchlist listNo="3"> | |
17 | +<item>業種</item> | |
18 | +<htmltag>dd</htmltag> | |
19 | +<htmlclass>category yjSb</htmlclass> | |
20 | +</searchlist> | |
21 | +<searchlist listNo="4"> | |
22 | +<item>取得時間</item> | |
23 | +<htmltag>dd</htmltag> | |
24 | +<htmlclass>yjSb real</htmlclass> | |
25 | +<regexp>^(.*)\t</regexp> | |
26 | +</searchlist> | |
27 | +<searchlist listNo="5"> | |
28 | +<item>銘柄名</item> | |
29 | +<htmltag>th</htmltag> | |
30 | +<htmlclass>symbol</htmlclass> | |
31 | +</searchlist> | |
32 | +<searchlist listNo="6"> | |
33 | +<item>株価</item> | |
34 | +<htmltag>td</htmltag> | |
35 | +<htmlclass>stoksPrice</htmlclass> | |
36 | +</searchlist> | |
37 | +<searchlist listNo="7"> | |
38 | +<item>前日比</item> | |
39 | +<htmltag>td</htmltag> | |
40 | +<htmlclass>change</htmlclass> | |
41 | +<regexp>\t(.*)(.*%)</regexp> | |
42 | +</searchlist> | |
43 | +<searchlist listNo="8"> | |
44 | +<item>前日比%</item> | |
45 | +<htmltag>td</htmltag> | |
46 | +<htmlclass>change</htmlclass> | |
47 | +<regexp>\t.*((.*)%)</regexp> | |
48 | +</searchlist> | |
49 | +<searchlist listNo="9"> | |
50 | +<item>前日終値</item> | |
51 | +<htmltag>div</htmltag> | |
52 | +<htmlclass>lineFi clearfix</htmlclass> | |
53 | +<around>0</around> | |
54 | +<regexp>^([,.0-9]+)\t</regexp> | |
55 | +</searchlist> | |
56 | +<searchlist listNo="10"> | |
57 | +<item>始値</item> | |
58 | +<htmltag>div</htmltag> | |
59 | +<htmlclass>lineFi clearfix</htmlclass> | |
60 | +<around>1</around> | |
61 | +<regexp>^([,.0-9]+|-{3})\t</regexp> | |
62 | +</searchlist> | |
63 | +<searchlist listNo="11"> | |
64 | +<item>高値</item> | |
65 | +<htmltag>div</htmltag> | |
66 | +<htmlclass>lineFi clearfix</htmlclass> | |
67 | +<around>2</around> | |
68 | +<regexp>^((ストップ高\t|ストップ安\t)?[,0-9]+|-{3})</regexp> | |
69 | +</searchlist> | |
70 | +<searchlist listNo="12"> | |
71 | +<item>安値</item> | |
72 | +<htmltag>div</htmltag> | |
73 | +<htmlclass>lineFi clearfix</htmlclass> | |
74 | +<around>3</around> | |
75 | +<regexp>^((ストップ高\t|ストップ安\t)?[,0-9]+|-{3})</regexp> | |
76 | +</searchlist> | |
77 | +<searchlist listNo="13"> | |
78 | +<item>出来高</item> | |
79 | +<htmltag>div</htmltag> | |
80 | +<htmlclass>lineFi clearfix</htmlclass> | |
81 | +<around>4</around> | |
82 | +<regexp>^(.*?)\t</regexp> | |
83 | +</searchlist> | |
84 | +<searchlist listNo="14"> | |
85 | +<item>売買代金</item> | |
86 | +<htmltag>div</htmltag> | |
87 | +<htmlclass>lineFi clearfix</htmlclass> | |
88 | +<around>5</around> | |
89 | +<regexp>^(.*?)\t</regexp> | |
90 | +</searchlist> | |
91 | +<searchlist listNo="15"> | |
92 | +<item>値幅制限</item> | |
93 | +<htmltag>div</htmltag> | |
94 | +<htmlclass>lineFi clearfix</htmlclass> | |
95 | +<around>6</around> | |
96 | +<regexp>^(.*?)\t</regexp> | |
97 | +</searchlist> | |
98 | +<searchlist listNo="16"> | |
99 | +<item>時価総額</item> | |
100 | +<htmltag>div</htmltag> | |
101 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
102 | +<around>0</around> | |
103 | +<regexp>^(.*?)\t</regexp> | |
104 | +</searchlist> | |
105 | +<searchlist listNo="17"> | |
106 | +<item>発行済株式数</item> | |
107 | +<htmltag>div</htmltag> | |
108 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
109 | +<around>1</around> | |
110 | +<regexp>^(.*?)\t</regexp> | |
111 | +</searchlist> | |
112 | +<searchlist listNo="18"> | |
113 | +<item>配当利回り</item> | |
114 | +<htmltag>div</htmltag> | |
115 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
116 | +<around>2</around> | |
117 | +<regexp>^(.*?)\t</regexp> | |
118 | +</searchlist> | |
119 | +<searchlist listNo="19"> | |
120 | +<item>1株配当</item> | |
121 | +<htmltag>div</htmltag> | |
122 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
123 | +<around>3</around> | |
124 | +<regexp>^(.*?)\t</regexp> | |
125 | +</searchlist> | |
126 | +<searchlist listNo="20"> | |
127 | +<item>PER</item> | |
128 | +<htmltag>div</htmltag> | |
129 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
130 | +<around>4</around> | |
131 | +<regexp>^(.*?)\t</regexp> | |
132 | +</searchlist> | |
133 | +<searchlist listNo="21"> | |
134 | +<item>PBR</item> | |
135 | +<htmltag>div</htmltag> | |
136 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
137 | +<around>5</around> | |
138 | +<regexp>^(.*?)\t</regexp> | |
139 | +</searchlist> | |
140 | +<searchlist listNo="22"> | |
141 | +<item>EPS</item> | |
142 | +<htmltag>div</htmltag> | |
143 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
144 | +<around>6</around> | |
145 | +<regexp>^(.*?)\t</regexp> | |
146 | +</searchlist> | |
147 | +<searchlist listNo="23"> | |
148 | +<item>BPS</item> | |
149 | +<htmltag>div</htmltag> | |
150 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
151 | +<around>7</around> | |
152 | +<regexp>^(.*?)\t</regexp> | |
153 | +</searchlist> | |
154 | +<searchlist listNo="24"> | |
155 | +<item>最低購入代金</item> | |
156 | +<htmltag>div</htmltag> | |
157 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
158 | +<around>8</around> | |
159 | +<regexp>^(.*?)\t</regexp> | |
160 | +</searchlist> | |
161 | +<searchlist listNo="25"> | |
162 | +<item>単元株数</item> | |
163 | +<htmltag>div</htmltag> | |
164 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
165 | +<around>9</around> | |
166 | +<regexp>^(.*?)\t</regexp> | |
167 | +</searchlist> | |
168 | +<searchlist listNo="26"> | |
169 | +<item>年初来高値</item> | |
170 | +<htmltag>div</htmltag> | |
171 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
172 | +<around>10</around> | |
173 | +<regexp>^(.*?)\t</regexp> | |
174 | +</searchlist> | |
175 | +<searchlist listNo="27"> | |
176 | +<item>年初来安値</item> | |
177 | +<htmltag>div</htmltag> | |
178 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
179 | +<around>11</around> | |
180 | +<regexp>^(.*?)\t</regexp> | |
181 | +</searchlist> | |
182 | +<searchlist listNo="28"> | |
183 | +<item>信用買残</item> | |
184 | +<htmltag>div</htmltag> | |
185 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
186 | +<around>12</around> | |
187 | +<regexp>^(.*?)\t</regexp> | |
188 | +</searchlist> | |
189 | +<searchlist listNo="29"> | |
190 | +<item>信用買残前週比</item> | |
191 | +<htmltag>div</htmltag> | |
192 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
193 | +<around>13</around> | |
194 | +<regexp>^(.*?)\t</regexp> | |
195 | +</searchlist> | |
196 | +<searchlist listNo="30"> | |
197 | +<item>信用売残</item> | |
198 | +<htmltag>div</htmltag> | |
199 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
200 | +<around>14</around> | |
201 | +<regexp>^(.*?)\t</regexp> | |
202 | +</searchlist> | |
203 | +<searchlist listNo="31"> | |
204 | +<item>信用売残前週比</item> | |
205 | +<htmltag>div</htmltag> | |
206 | +<htmlclass>lineFi yjMS clearfix</htmlclass> | |
207 | +<around>15</around> | |
208 | +<regexp>^(.*?)\t</regexp> | |
209 | +</searchlist> | |
210 | +<searchlist listNo="32"> | |
211 | +<item>貸借倍率</item> | |
212 | +<htmltag>div</htmltag> | |
213 | +<htmlclass>yjMS clearfix</htmlclass> | |
214 | +<regexp>^(.*?)\t</regexp> | |
215 | +</searchlist> | |
216 | +</webscraping> | |
217 | +</xmlcontainer> |
@@ -0,0 +1,314 @@ | ||
1 | +/* | |
2 | + * Copyright (C) 2014 kgto. | |
3 | + * | |
4 | + * This library is free software; you can redistribute it and/or | |
5 | + * modify it under the terms of the GNU Lesser General Public | |
6 | + * License as published by the Free Software Foundation; either | |
7 | + * version 2.1 of the License, or (at your option) any later version. | |
8 | + * | |
9 | + * This library is distributed in the hope that it will be useful, | |
10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 | + * Lesser General Public License for more details. | |
13 | + * | |
14 | + * You should have received a copy of the GNU Lesser General Public | |
15 | + * License along with this library; if not, write to the Free Software | |
16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
17 | + * MA 02110-1301 USA | |
18 | + */ | |
19 | +/* | |
20 | + * $Id$ | |
21 | + */ | |
22 | + | |
23 | +package utility.test1; | |
24 | + | |
25 | +import webScraping.core.SearchData; | |
26 | +import java.io.File; | |
27 | +import java.io.FileNotFoundException; | |
28 | +import java.io.FileOutputStream; | |
29 | +import java.io.IOException; | |
30 | +import java.util.logging.Level; | |
31 | +import java.util.logging.Logger; | |
32 | +import javax.xml.parsers.DocumentBuilder; | |
33 | +import javax.xml.parsers.DocumentBuilderFactory; | |
34 | +import javax.xml.parsers.ParserConfigurationException; | |
35 | +import javax.xml.transform.Transformer; | |
36 | +import javax.xml.transform.TransformerConfigurationException; | |
37 | +import javax.xml.transform.TransformerException; | |
38 | +import javax.xml.transform.TransformerFactory; | |
39 | +import javax.xml.transform.dom.DOMSource; | |
40 | +import javax.xml.transform.stream.StreamResult; | |
41 | +import org.w3c.dom.DOMImplementation; | |
42 | +import org.w3c.dom.Document; | |
43 | +import org.w3c.dom.Element; | |
44 | +import org.w3c.dom.Node; | |
45 | +import org.w3c.dom.NodeList; | |
46 | +import org.xml.sax.SAXException; | |
47 | + | |
48 | +/** | |
49 | + * 検索データ読込・保存. | |
50 | + * @author kgto | |
51 | + */ | |
52 | +public class SearchDataRW { | |
53 | + /* ---------------------------------------------------------------------- * | |
54 | + * フィールド | |
55 | + * ---------------------------------------------------------------------- */ | |
56 | + private String UrlAdress; | |
57 | + | |
58 | + DocumentBuilder builder; | |
59 | + public Document document; | |
60 | + Element root; | |
61 | + | |
62 | + /* ---------------------------------------------------------------------- * | |
63 | + * コンストラクタ | |
64 | + * ---------------------------------------------------------------------- */ | |
65 | + public SearchDataRW() { | |
66 | + try { | |
67 | + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
68 | + builder = factory.newDocumentBuilder(); | |
69 | + | |
70 | + } catch (ParserConfigurationException ex) { | |
71 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
72 | + } | |
73 | + } | |
74 | + | |
75 | + /* ---------------------------------------------------------------------- * | |
76 | + * Setter | |
77 | + * ---------------------------------------------------------------------- */ | |
78 | + public void seturl(String UrlAdress) { | |
79 | + this.UrlAdress = UrlAdress; | |
80 | + } | |
81 | + | |
82 | + /* ---------------------------------------------------------------------- * | |
83 | + * Getter | |
84 | + * ---------------------------------------------------------------------- */ | |
85 | + public String geturl() { | |
86 | + return UrlAdress; | |
87 | + } | |
88 | + | |
89 | + /* ---------------------------------------------------------------------- * | |
90 | + * メソッド | |
91 | + * ---------------------------------------------------------------------- */ | |
92 | + /** | |
93 | + * 保存. | |
94 | + * @param file | |
95 | + */ | |
96 | + public void save(File file) { | |
97 | + saveUrl(UrlAdress); | |
98 | + saveSearchList(); | |
99 | + write(file); | |
100 | + } | |
101 | + | |
102 | + /** | |
103 | + * 読込. | |
104 | + * @param file | |
105 | + */ | |
106 | + public void load(File file) { | |
107 | + read(file); | |
108 | + loadUrl(); | |
109 | + loadSearchList(); | |
110 | + } | |
111 | + | |
112 | + /* ---------------------------------------------------------------------- */ | |
113 | + | |
114 | + void loadUrl() { | |
115 | + NodeList nodelist = root.getElementsByTagName("url"); | |
116 | + Node node = nodelist.item(0); | |
117 | + UrlAdress = node.getFirstChild().getNodeValue(); | |
118 | + } | |
119 | + | |
120 | + public void loadSearchList() { | |
121 | + SearchData.clear(); | |
122 | + | |
123 | + NodeList nodelist = root.getElementsByTagName("searchlist"); | |
124 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
125 | + Node childnode = nodelist.item(i); | |
126 | + | |
127 | + boolean sdatflg = false; | |
128 | + SearchData sdat = new SearchData(); | |
129 | + for (Node child = childnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
130 | + if(child.getNodeType() == Node.ELEMENT_NODE) { | |
131 | + String tag = child.getNodeName(); | |
132 | + String rtn = ""; | |
133 | + if(child.getFirstChild() != null) { | |
134 | + rtn = child.getFirstChild().getNodeValue(); | |
135 | + } | |
136 | + switch (tag) { | |
137 | + case "item" : | |
138 | + sdat.setitem(rtn); | |
139 | + sdatflg = true; | |
140 | + break; | |
141 | + case "htmltag" : | |
142 | + sdat.setHtmltag(rtn); | |
143 | + sdatflg = true; | |
144 | + break; | |
145 | + case "htmlid" : | |
146 | + sdat.setHtmlid(rtn); | |
147 | + sdatflg = true; | |
148 | + break; | |
149 | + case "htmlclass" : | |
150 | + sdat.setHtmlclass(rtn); | |
151 | + sdatflg = true; | |
152 | + break; | |
153 | + case "around" : | |
154 | + sdat.setaround(rtn); | |
155 | + sdatflg = true; | |
156 | + break; | |
157 | + case "regexp" : | |
158 | + sdat.setregexp(rtn); | |
159 | + sdatflg = true; | |
160 | + break; | |
161 | + } | |
162 | + } | |
163 | + } | |
164 | + if(sdatflg) SearchData.add(sdat); | |
165 | + } | |
166 | + } | |
167 | + | |
168 | + public String loadMsg404() { | |
169 | + StringBuilder strbuf = new StringBuilder(); | |
170 | + NodeList nodelist = root.getElementsByTagName("msg404"); | |
171 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
172 | + Node childnode = nodelist.item(i); | |
173 | + String str = childnode.getFirstChild().getNodeValue(); | |
174 | + if(strbuf.length() > 0) { | |
175 | + strbuf.append("\n"); | |
176 | + } | |
177 | + strbuf.append(str); | |
178 | + } | |
179 | + return strbuf.toString(); | |
180 | + } | |
181 | + | |
182 | + public Element loadElement(String elementTagName) { | |
183 | + NodeList nodelist = root.getElementsByTagName(elementTagName); | |
184 | + Element element = (Element)nodelist.item(0); | |
185 | + | |
186 | + return element; | |
187 | + } | |
188 | + | |
189 | + /* ---------------------------------------------------------------------- */ | |
190 | + | |
191 | + void saveUrl(String urladdress) { | |
192 | + checkdoc(); | |
193 | + removeElement("url"); // 既にElementが存在してた場合、一度削除 | |
194 | + | |
195 | + Element url = document.createElement("url"); | |
196 | + url.appendChild(document.createTextNode(urladdress)); | |
197 | + root.appendChild(url); | |
198 | + } | |
199 | + | |
200 | + void saveSearchList() { | |
201 | + checkdoc(); | |
202 | + removeElement("searchlist"); // 既にElementが存在してた場合、一度削除 | |
203 | + | |
204 | + int count = 0; | |
205 | + for(int i = 0; i < SearchData.size(); i++) { | |
206 | + SearchData sdat = SearchData.get(i); | |
207 | + | |
208 | + Element cslist = document.createElement("searchlist"); | |
209 | + cslist.setAttribute("listNo", String.valueOf(++count)); | |
210 | + | |
211 | + addChild(cslist, "item", sdat.getitem()); | |
212 | + addChild(cslist, "htmltag", sdat.getHtmltag()); | |
213 | + addChild(cslist, "htmlid", sdat.getHtmlid()); | |
214 | + addChild(cslist, "htmlclass", sdat.getHtmlclass()); | |
215 | + addChild(cslist, "around", sdat.getaround()); | |
216 | + addChild(cslist, "regexp", sdat.getregexp()); | |
217 | + | |
218 | + root.appendChild(cslist); | |
219 | + } | |
220 | + } | |
221 | + | |
222 | + void saveMsg404(String msg) { | |
223 | + checkdoc(); | |
224 | + removeElement("msg404"); // 既にElementが存在してた場合、一度削除 | |
225 | + | |
226 | + String[] msgs = msg.split("\n"); | |
227 | + int count = 0; | |
228 | + for(String msgOne : msgs) { | |
229 | + Element msgElement = document.createElement("msg404"); | |
230 | + msgElement.setAttribute("No", String.valueOf(++count)); | |
231 | + msgElement.appendChild(document.createTextNode(msgOne)); | |
232 | + | |
233 | + root.appendChild(msgElement); | |
234 | + } | |
235 | + } | |
236 | + | |
237 | + public void saveElement(Element element) { | |
238 | + checkdoc(); | |
239 | + removeElement(element.getTagName()); // 既にElementが存在してた場合、一度削除 | |
240 | + | |
241 | + root.appendChild(element); | |
242 | + } | |
243 | + | |
244 | + /* ---------------------------------------------------------------------- */ | |
245 | + | |
246 | + private void addChild(Element cslist, String keyword, String data) { | |
247 | + if(!data.isEmpty()) { | |
248 | + Element element = document.createElement(keyword); | |
249 | + element.appendChild(document.createTextNode(data)); | |
250 | + cslist.appendChild(element); | |
251 | + } | |
252 | + } | |
253 | + | |
254 | + private void removeElement(String elementTagName) { | |
255 | + int nodeSize; | |
256 | + do { | |
257 | + NodeList nodelist = document.getElementsByTagName(elementTagName); | |
258 | + nodeSize = nodelist.getLength(); | |
259 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
260 | + Node node = nodelist.item(i); | |
261 | + root.removeChild(node); | |
262 | + } | |
263 | + } while(nodeSize > 0); | |
264 | + } | |
265 | + | |
266 | + /** | |
267 | + * ドキュメントチェック. | |
268 | + * 新規の場合やXMLファイルの読込みが行われていない状態時、新たにルートエレメントを作成する。 | |
269 | + * 既読の場合、ルートエレメントの取得を行う。 | |
270 | + */ | |
271 | + public void checkdoc() { | |
272 | + if(document == null) { | |
273 | + DOMImplementation domImpl = builder.getDOMImplementation(); | |
274 | + document = domImpl.createDocument("","searchdata",null); | |
275 | + } | |
276 | + root = document.getDocumentElement(); | |
277 | + } | |
278 | + | |
279 | + /** | |
280 | + * XML読込み. | |
281 | + * @param file | |
282 | + */ | |
283 | + public void read(File file) { | |
284 | + try { | |
285 | + document = builder.parse(file); | |
286 | + root = document.getDocumentElement(); | |
287 | + | |
288 | + } catch (SAXException | IOException ex) { | |
289 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
290 | + } | |
291 | + } | |
292 | + | |
293 | + /** | |
294 | + * XML書込み. | |
295 | + * @param file | |
296 | + */ | |
297 | + public void write(File file) { | |
298 | + try { | |
299 | + TransformerFactory transFactory = TransformerFactory.newInstance(); | |
300 | + Transformer transformer = transFactory.newTransformer(); | |
301 | + | |
302 | + DOMSource source = new DOMSource(document); | |
303 | + FileOutputStream os = new FileOutputStream(file); | |
304 | + StreamResult result = new StreamResult(os); | |
305 | + transformer.transform(source, result); | |
306 | + | |
307 | + } catch (TransformerConfigurationException ex) { | |
308 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
309 | + } catch (FileNotFoundException | TransformerException ex) { | |
310 | + Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
311 | + } | |
312 | + } | |
313 | + | |
314 | +} |
@@ -1,7 +1,6 @@ | ||
1 | 1 | |
2 | 2 | package utility.test1; |
3 | 3 | |
4 | -import webScraping.utility.SearchDataRW; | |
5 | 4 | import java.io.File; |
6 | 5 | import java.lang.reflect.InvocationTargetException; |
7 | 6 | import java.lang.reflect.Method; |
@@ -1,7 +1,6 @@ | ||
1 | 1 | |
2 | 2 | package utility.test1; |
3 | 3 | |
4 | -import webScraping.utility.SearchDataRW; | |
5 | 4 | import java.io.File; |
6 | 5 | import webScraping.core.SearchData; |
7 | 6 |
@@ -0,0 +1,42 @@ | ||
1 | + | |
2 | +package utility.test1; | |
3 | + | |
4 | +import java.io.File; | |
5 | +import webScraping.utility.ScrapingXml; | |
6 | + | |
7 | +/** | |
8 | + * XMLコンバータ | |
9 | + * 旧:SearchDataRW.java → 新:ScrapingXml.java | |
10 | + * @author kgto | |
11 | + */ | |
12 | +public class ConvertXml01 { | |
13 | + | |
14 | + private String UrlAdress; | |
15 | + File file = new File("test1.xml"); | |
16 | + | |
17 | + /** | |
18 | + * @param args the command line arguments | |
19 | + */ | |
20 | + public static void main(String[] args) { | |
21 | + ConvertXml01 conv = new ConvertXml01(); | |
22 | + | |
23 | + conv.readold(); | |
24 | + conv.writenew(); | |
25 | + | |
26 | + System.exit(0); | |
27 | + } | |
28 | + | |
29 | + void readold() { | |
30 | + SearchDataRW sdatrw = new SearchDataRW(); | |
31 | + sdatrw.load(file); | |
32 | + UrlAdress = sdatrw.geturl(); | |
33 | + } | |
34 | + | |
35 | + void writenew() { | |
36 | + ScrapingXml xmlwriter = new ScrapingXml(); | |
37 | + xmlwriter.setTestUrl(UrlAdress); | |
38 | + xmlwriter.setSdata(); | |
39 | + xmlwriter.save(file); | |
40 | + } | |
41 | + | |
42 | +} |
@@ -1,16 +1,71 @@ | ||
1 | -<?xml version="1.0" encoding="UTF-8" standalone="no"?><searchdata> | |
2 | - | |
3 | - | |
4 | - | |
5 | - | |
6 | - | |
7 | - | |
8 | - | |
9 | - | |
10 | - | |
11 | - | |
12 | - | |
13 | - | |
14 | - | |
15 | - | |
16 | -<url>http://weather.yahoo.co.jp/weather/</url><searchlist listNo="1"><item>天気01</item><htmltag>li</htmltag><htmlclass>point pt1400</htmlclass></searchlist><searchlist listNo="2"><item>天気02</item><htmltag>li</htmltag><htmlclass>point pt1900</htmlclass></searchlist><searchlist listNo="3"><item>天気03</item><htmltag>li</htmltag><htmlclass>point pt3410</htmlclass></searchlist><searchlist listNo="4"><item>天気04</item><htmltag>li</htmltag><htmlclass>point pt4410</htmlclass></searchlist><searchlist listNo="5"><item>天気05</item><htmltag>li</htmltag><htmlclass>point pt5110</htmlclass></searchlist><searchlist listNo="6"><item>天気06</item><htmltag>li</htmltag><htmlclass>point pt5410</htmlclass></searchlist><searchlist listNo="7"><item>天気07</item><htmltag>li</htmltag><htmlclass>point pt5610</htmlclass></searchlist><searchlist listNo="8"><item>天気08</item><htmltag>li</htmltag><htmlclass>point pt6200</htmlclass></searchlist><searchlist listNo="9"><item>天気09</item><htmltag>li</htmltag><htmlclass>point pt6710</htmlclass></searchlist><searchlist listNo="10"><item>天気10</item><htmltag>li</htmltag><htmlclass>point pt7410</htmlclass></searchlist><searchlist listNo="11"><item>天気11</item><htmltag>li</htmltag><htmlclass>point pt8210</htmlclass></searchlist><searchlist listNo="12"><item>天気12</item><htmltag>li</htmltag><htmlclass>point pt8810</htmlclass></searchlist><searchlist listNo="13"><item>天気13</item><htmltag>li</htmltag><htmlclass>point pt9110</htmlclass></searchlist></searchdata> | |
\ No newline at end of file | ||
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<xmlcontainer> | |
3 | +<webscraping> | |
4 | +<url>http://weather.yahoo.co.jp/weather/</url> | |
5 | +<searchlist listNo="1"> | |
6 | +<item>天気01</item> | |
7 | +<htmltag>li</htmltag> | |
8 | +<htmlclass>point pt1400</htmlclass> | |
9 | +</searchlist> | |
10 | +<searchlist listNo="2"> | |
11 | +<item>天気02</item> | |
12 | +<htmltag>li</htmltag> | |
13 | +<htmlclass>point pt1900</htmlclass> | |
14 | +</searchlist> | |
15 | +<searchlist listNo="3"> | |
16 | +<item>天気03</item> | |
17 | +<htmltag>li</htmltag> | |
18 | +<htmlclass>point pt3410</htmlclass> | |
19 | +</searchlist> | |
20 | +<searchlist listNo="4"> | |
21 | +<item>天気04</item> | |
22 | +<htmltag>li</htmltag> | |
23 | +<htmlclass>point pt4410</htmlclass> | |
24 | +</searchlist> | |
25 | +<searchlist listNo="5"> | |
26 | +<item>天気05</item> | |
27 | +<htmltag>li</htmltag> | |
28 | +<htmlclass>point pt5110</htmlclass> | |
29 | +</searchlist> | |
30 | +<searchlist listNo="6"> | |
31 | +<item>天気06</item> | |
32 | +<htmltag>li</htmltag> | |
33 | +<htmlclass>point pt5410</htmlclass> | |
34 | +</searchlist> | |
35 | +<searchlist listNo="7"> | |
36 | +<item>天気07</item> | |
37 | +<htmltag>li</htmltag> | |
38 | +<htmlclass>point pt5610</htmlclass> | |
39 | +</searchlist> | |
40 | +<searchlist listNo="8"> | |
41 | +<item>天気08</item> | |
42 | +<htmltag>li</htmltag> | |
43 | +<htmlclass>point pt6200</htmlclass> | |
44 | +</searchlist> | |
45 | +<searchlist listNo="9"> | |
46 | +<item>天気09</item> | |
47 | +<htmltag>li</htmltag> | |
48 | +<htmlclass>point pt6710</htmlclass> | |
49 | +</searchlist> | |
50 | +<searchlist listNo="10"> | |
51 | +<item>天気10</item> | |
52 | +<htmltag>li</htmltag> | |
53 | +<htmlclass>point pt7410</htmlclass> | |
54 | +</searchlist> | |
55 | +<searchlist listNo="11"> | |
56 | +<item>天気11</item> | |
57 | +<htmltag>li</htmltag> | |
58 | +<htmlclass>point pt8210</htmlclass> | |
59 | +</searchlist> | |
60 | +<searchlist listNo="12"> | |
61 | +<item>天気12</item> | |
62 | +<htmltag>li</htmltag> | |
63 | +<htmlclass>point pt8810</htmlclass> | |
64 | +</searchlist> | |
65 | +<searchlist listNo="13"> | |
66 | +<item>天気13</item> | |
67 | +<htmltag>li</htmltag> | |
68 | +<htmlclass>point pt9110</htmlclass> | |
69 | +</searchlist> | |
70 | +</webscraping> | |
71 | +</xmlcontainer> |
@@ -1,547 +0,0 @@ | ||
1 | -/* | |
2 | - * Copyright (C) 2014 kgto. | |
3 | - * | |
4 | - * This library is free software; you can redistribute it and/or | |
5 | - * modify it under the terms of the GNU Lesser General Public | |
6 | - * License as published by the Free Software Foundation; either | |
7 | - * version 2.1 of the License, or (at your option) any later version. | |
8 | - * | |
9 | - * This library is distributed in the hope that it will be useful, | |
10 | - * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 | - * Lesser General Public License for more details. | |
13 | - * | |
14 | - * You should have received a copy of the GNU Lesser General Public | |
15 | - * License along with this library; if not, write to the Free Software | |
16 | - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
17 | - * MA 02110-1301 USA | |
18 | - */ | |
19 | -/* | |
20 | - * $Id$ | |
21 | - */ | |
22 | - | |
23 | -package webScraping.utility; | |
24 | - | |
25 | -import webScraping.core.SearchData; | |
26 | -import java.io.BufferedReader; | |
27 | -import java.io.BufferedWriter; | |
28 | -import java.io.File; | |
29 | -import java.io.FileInputStream; | |
30 | -import java.io.FileNotFoundException; | |
31 | -import java.io.FileOutputStream; | |
32 | -import java.io.IOException; | |
33 | -import java.io.InputStreamReader; | |
34 | -import java.io.OutputStreamWriter; | |
35 | -import java.util.ArrayList; | |
36 | -import java.util.logging.Level; | |
37 | -import java.util.logging.Logger; | |
38 | -import javax.xml.parsers.DocumentBuilder; | |
39 | -import javax.xml.parsers.DocumentBuilderFactory; | |
40 | -import javax.xml.parsers.ParserConfigurationException; | |
41 | -import javax.xml.transform.Transformer; | |
42 | -import javax.xml.transform.TransformerConfigurationException; | |
43 | -import javax.xml.transform.TransformerException; | |
44 | -import javax.xml.transform.TransformerFactory; | |
45 | -import javax.xml.transform.dom.DOMSource; | |
46 | -import javax.xml.transform.stream.StreamResult; | |
47 | -import org.w3c.dom.DOMImplementation; | |
48 | -import org.w3c.dom.Document; | |
49 | -import org.w3c.dom.Element; | |
50 | -import org.w3c.dom.Node; | |
51 | -import org.w3c.dom.NodeList; | |
52 | -import org.xml.sax.SAXException; | |
53 | - | |
54 | -/** | |
55 | - * | |
56 | - * @author kgto | |
57 | - */ | |
58 | -public class SearchDataRW { | |
59 | - | |
60 | - DocumentBuilder builder; | |
61 | - public Document document; | |
62 | - Element root; | |
63 | - | |
64 | - private final String splitchar = "\t"; | |
65 | - | |
66 | - private String UrlAdress; | |
67 | - private ArrayList<SearchData> slist = new ArrayList<>(); | |
68 | - | |
69 | - public SearchDataRW() { | |
70 | - try { | |
71 | - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
72 | - builder = factory.newDocumentBuilder(); | |
73 | - | |
74 | - } catch (ParserConfigurationException ex) { | |
75 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
76 | - } | |
77 | - } | |
78 | - | |
79 | - public void seturl(String UrlAdress) { | |
80 | - this.UrlAdress = UrlAdress; | |
81 | - } | |
82 | - | |
83 | - public void setslist(ArrayList slist) { | |
84 | - this.slist = slist; | |
85 | - } | |
86 | - | |
87 | - public String geturl() { | |
88 | - return UrlAdress; | |
89 | - } | |
90 | - | |
91 | - public ArrayList getslist() { | |
92 | - return slist; | |
93 | - } | |
94 | - | |
95 | - /** | |
96 | - * 保存. | |
97 | - * @param file | |
98 | - */ | |
99 | - public void save(File file) { | |
100 | - //saveCsv(file); | |
101 | - //saveXml(file); | |
102 | - | |
103 | - saveUrl(UrlAdress); | |
104 | - saveSearchList(slist); | |
105 | - write(file); | |
106 | - } | |
107 | - | |
108 | - /** | |
109 | - * 読込. | |
110 | - * @param file | |
111 | - */ | |
112 | - public void load(File file) { | |
113 | - //loadCsv(file); | |
114 | - //loadXml(file); | |
115 | - | |
116 | - read(file); | |
117 | - loadUrl(); | |
118 | - loadSearchList(); | |
119 | - } | |
120 | - | |
121 | - /* ---------------------------------------------------------------------- */ | |
122 | - /** | |
123 | - * 保存(CSV形式). | |
124 | - * @param file | |
125 | - */ | |
126 | - public void saveCsv(File file) { | |
127 | - BufferedWriter bufferedwriter = null; | |
128 | - try { | |
129 | - //空のファイルを作成 | |
130 | - file.createNewFile(); | |
131 | - FileOutputStream fileoutputstream = new FileOutputStream(file); | |
132 | - OutputStreamWriter outputstreamwriter = new OutputStreamWriter(fileoutputstream, "UTF-8"); | |
133 | - bufferedwriter = new BufferedWriter(outputstreamwriter); | |
134 | - | |
135 | - // URL | |
136 | - bufferedwriter.write(UrlAdress); | |
137 | - bufferedwriter.write("\n"); | |
138 | - // 検索情報 | |
139 | - for(Object slist1 : slist) { | |
140 | - SearchData sdat = (SearchData)slist1; | |
141 | - // | |
142 | - StringBuilder str = new StringBuilder(); | |
143 | - str.append(sdat.getitem()).append(splitchar); | |
144 | - str.append(sdat.getHtmltag()).append(splitchar); | |
145 | - str.append(sdat.getHtmlid()).append(splitchar); | |
146 | - str.append(sdat.getHtmlclass()).append(splitchar); | |
147 | - str.append(sdat.getaround()).append(splitchar); | |
148 | - str.append(sdat.getregexp()).append("\n"); | |
149 | - // 書込み | |
150 | - bufferedwriter.write(str.toString()); | |
151 | - } | |
152 | - | |
153 | - } catch (IOException ex) { | |
154 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
155 | - } finally { | |
156 | - try { | |
157 | - if(bufferedwriter != null) { | |
158 | - bufferedwriter.close(); | |
159 | - } | |
160 | - | |
161 | - } catch (IOException ex) { | |
162 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
163 | - } | |
164 | - } | |
165 | - } | |
166 | - | |
167 | - /** | |
168 | - * 読込(CSV形式). | |
169 | - * @param file | |
170 | - */ | |
171 | - public void loadCsv(File file) { | |
172 | - slist = new ArrayList(); | |
173 | - | |
174 | - BufferedReader bufferedreader = null; | |
175 | - try { | |
176 | - FileInputStream fileinputstream = new FileInputStream(file); | |
177 | - InputStreamReader inputstreamreader = new InputStreamReader(fileinputstream, "UTF-8"); | |
178 | - bufferedreader = new BufferedReader(inputstreamreader); | |
179 | - | |
180 | - // URL | |
181 | - UrlAdress = bufferedreader.readLine(); | |
182 | - // 検索情報 | |
183 | - String rec; | |
184 | - while((rec = bufferedreader.readLine()) != null) { | |
185 | - String[] recary = rec.split(splitchar, -1); | |
186 | - SearchData sdat = new SearchData(); | |
187 | - sdat.setitem(recary[0]); | |
188 | - sdat.setHtmltag(recary[1]); | |
189 | - sdat.setHtmlid(recary[2]); | |
190 | - sdat.setHtmlclass(recary[3]); | |
191 | - sdat.setaround(recary[4]); | |
192 | - sdat.setregexp(recary[5]); | |
193 | - | |
194 | - slist.add(sdat); | |
195 | - } | |
196 | - | |
197 | - } catch(IOException ex) { | |
198 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
199 | - | |
200 | - } finally { | |
201 | - try { | |
202 | - if(bufferedreader != null) { | |
203 | - bufferedreader.close(); | |
204 | - } | |
205 | - | |
206 | - } catch (IOException ex) { | |
207 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
208 | - } | |
209 | - } | |
210 | - } | |
211 | - | |
212 | - /* ---------------------------------------------------------------------- */ | |
213 | - /** | |
214 | - * 保存(XML形式). | |
215 | - * @param file | |
216 | - */ | |
217 | - public void saveXml(File file) { | |
218 | - try { | |
219 | - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
220 | - DocumentBuilder wkBuilder = factory.newDocumentBuilder(); | |
221 | - DOMImplementation domImpl = wkBuilder.getDOMImplementation(); | |
222 | - | |
223 | - Document doc = domImpl.createDocument("","searchdata",null); | |
224 | - Element wkRoot = doc.getDocumentElement(); | |
225 | - | |
226 | - // URL | |
227 | - Element url = doc.createElement("url"); | |
228 | - url.appendChild(doc.createTextNode(UrlAdress)); | |
229 | - wkRoot.appendChild(url); | |
230 | - | |
231 | - // 検索情報 | |
232 | - for (Object slist1 : slist) { | |
233 | - SearchData sdat = (SearchData) slist1; | |
234 | - | |
235 | - Element cslist = doc.createElement("searchlist"); | |
236 | - Element item = doc.createElement("item"); | |
237 | - Element htmltag = doc.createElement("htmltag"); | |
238 | - Element htmlid = doc.createElement("htmlid"); | |
239 | - Element htmlclass = doc.createElement("htmlclass"); | |
240 | - Element around = doc.createElement("around"); | |
241 | - Element regexp = doc.createElement("regexp"); | |
242 | - | |
243 | - item.appendChild(doc.createTextNode(sdat.getitem())); | |
244 | - htmltag.appendChild(doc.createTextNode(sdat.getHtmltag())); | |
245 | - htmlid.appendChild(doc.createTextNode(sdat.getHtmlid())); | |
246 | - htmlclass.appendChild(doc.createTextNode(sdat.getHtmlclass())); | |
247 | - around.appendChild(doc.createTextNode(sdat.getaround())); | |
248 | - regexp.appendChild(doc.createTextNode(sdat.getregexp())); | |
249 | - | |
250 | - cslist.appendChild(item); | |
251 | - cslist.appendChild(htmltag); | |
252 | - cslist.appendChild(htmlid); | |
253 | - cslist.appendChild(htmlclass); | |
254 | - cslist.appendChild(around); | |
255 | - cslist.appendChild(regexp); | |
256 | - | |
257 | - wkRoot.appendChild(cslist); | |
258 | - } | |
259 | - // 出力 | |
260 | - TransformerFactory transFactory = TransformerFactory.newInstance(); | |
261 | - Transformer transformer = transFactory.newTransformer(); | |
262 | - | |
263 | - DOMSource source = new DOMSource(doc); | |
264 | - FileOutputStream os = new FileOutputStream(file); | |
265 | - StreamResult result = new StreamResult(os); | |
266 | - transformer.transform(source, result); | |
267 | - | |
268 | - } catch (ParserConfigurationException | FileNotFoundException ex) { | |
269 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
270 | - } catch (TransformerConfigurationException ex) { | |
271 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
272 | - } catch (TransformerException ex) { | |
273 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
274 | - } | |
275 | - } | |
276 | - | |
277 | - /** | |
278 | - * 読込(XML形式). | |
279 | - * @param file | |
280 | - */ | |
281 | - public void loadXml(File file) { | |
282 | - slist = new ArrayList(); | |
283 | - | |
284 | - try { | |
285 | - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
286 | - DocumentBuilder wkBuilder = factory.newDocumentBuilder(); | |
287 | - Document doc = wkBuilder.parse(file); | |
288 | - | |
289 | - // ルート要素の取得 | |
290 | - Element wkRoot = doc.getDocumentElement(); | |
291 | - | |
292 | - // URL | |
293 | - NodeList url = wkRoot.getElementsByTagName("url"); | |
294 | - Node urlnode = url.item(0); | |
295 | - UrlAdress = urlnode.getFirstChild().getNodeValue(); | |
296 | - | |
297 | - // 検索情報 | |
298 | - NodeList cslist = wkRoot.getElementsByTagName("searchlist"); | |
299 | - for(int i = 0; i < cslist.getLength(); i++) { | |
300 | - SearchData sdat = new SearchData(); | |
301 | - | |
302 | - Node slistnode = cslist.item(i); | |
303 | - Node child; | |
304 | - for (child = slistnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
305 | - if(child.getNodeType() == Node.ELEMENT_NODE) { | |
306 | - | |
307 | - String tag = child.getNodeName(); | |
308 | - String rtn = ""; | |
309 | - if(child.getFirstChild() != null) { | |
310 | - rtn = child.getFirstChild().getNodeValue(); | |
311 | - } | |
312 | - | |
313 | - switch (tag) { | |
314 | - case "item" : | |
315 | - sdat.setitem(rtn); | |
316 | - break; | |
317 | - case "htmltag" : | |
318 | - sdat.setHtmltag(rtn); | |
319 | - break; | |
320 | - case "htmlid" : | |
321 | - sdat.setHtmlid(rtn); | |
322 | - break; | |
323 | - case "htmlclass" : | |
324 | - sdat.setHtmlclass(rtn); | |
325 | - break; | |
326 | - case "around" : | |
327 | - sdat.setaround(rtn); | |
328 | - break; | |
329 | - case "regexp" : | |
330 | - sdat.setregexp(rtn); | |
331 | - break; | |
332 | - } | |
333 | - } | |
334 | - } | |
335 | - slist.add(sdat); | |
336 | - } | |
337 | - | |
338 | - } catch (ParserConfigurationException | SAXException | IOException ex) { | |
339 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
340 | - } | |
341 | - } | |
342 | - | |
343 | - /* ---------------------------------------------------------------------- */ | |
344 | - | |
345 | - void loadUrl() { | |
346 | - NodeList nodelist = root.getElementsByTagName("url"); | |
347 | - Node node = nodelist.item(0); | |
348 | - UrlAdress = node.getFirstChild().getNodeValue(); | |
349 | - } | |
350 | - | |
351 | - public void loadSearchList() { | |
352 | - slist.clear(); | |
353 | - SearchData.clear(); | |
354 | - | |
355 | - NodeList nodelist = root.getElementsByTagName("searchlist"); | |
356 | - for(int i = 0; i < nodelist.getLength(); i++) { | |
357 | - Node childnode = nodelist.item(i); | |
358 | - | |
359 | - boolean sdatflg = false; | |
360 | - SearchData sdat = new SearchData(); | |
361 | - for (Node child = childnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
362 | - if(child.getNodeType() == Node.ELEMENT_NODE) { | |
363 | - String tag = child.getNodeName(); | |
364 | - String rtn = ""; | |
365 | - if(child.getFirstChild() != null) { | |
366 | - rtn = child.getFirstChild().getNodeValue(); | |
367 | - } | |
368 | - switch (tag) { | |
369 | - case "item" : | |
370 | - sdat.setitem(rtn); | |
371 | - sdatflg = true; | |
372 | - break; | |
373 | - case "htmltag" : | |
374 | - sdat.setHtmltag(rtn); | |
375 | - sdatflg = true; | |
376 | - break; | |
377 | - case "htmlid" : | |
378 | - sdat.setHtmlid(rtn); | |
379 | - sdatflg = true; | |
380 | - break; | |
381 | - case "htmlclass" : | |
382 | - sdat.setHtmlclass(rtn); | |
383 | - sdatflg = true; | |
384 | - break; | |
385 | - case "around" : | |
386 | - sdat.setaround(rtn); | |
387 | - sdatflg = true; | |
388 | - break; | |
389 | - case "regexp" : | |
390 | - sdat.setregexp(rtn); | |
391 | - sdatflg = true; | |
392 | - break; | |
393 | - } | |
394 | - } | |
395 | - } | |
396 | - if(sdatflg) slist.add(sdat); | |
397 | - if(sdatflg) SearchData.add(sdat); | |
398 | - } | |
399 | - } | |
400 | - | |
401 | - public String loadMsg404() { | |
402 | - StringBuilder strbuf = new StringBuilder(); | |
403 | - NodeList nodelist = root.getElementsByTagName("msg404"); | |
404 | - for(int i = 0; i < nodelist.getLength(); i++) { | |
405 | - Node childnode = nodelist.item(i); | |
406 | - String str = childnode.getFirstChild().getNodeValue(); | |
407 | - if(strbuf.length() > 0) { | |
408 | - strbuf.append("\n"); | |
409 | - } | |
410 | - strbuf.append(str); | |
411 | - } | |
412 | - return strbuf.toString(); | |
413 | - } | |
414 | - | |
415 | - public Element loadElement(String elementTagName) { | |
416 | - NodeList nodelist = root.getElementsByTagName(elementTagName); | |
417 | - Element element = (Element)nodelist.item(0); | |
418 | - | |
419 | - return element; | |
420 | - } | |
421 | - | |
422 | - /* ---------------------------------------------------------------------- */ | |
423 | - | |
424 | - void saveUrl(String urladdress) { | |
425 | - checkdoc(); | |
426 | - removeElement("url"); // 既にElementが存在してた場合、一度削除 | |
427 | - | |
428 | - Element url = document.createElement("url"); | |
429 | - url.appendChild(document.createTextNode(urladdress)); | |
430 | - root.appendChild(url); | |
431 | - } | |
432 | - | |
433 | - void saveSearchList(ArrayList slist) { | |
434 | - checkdoc(); | |
435 | - removeElement("searchlist"); // 既にElementが存在してた場合、一度削除 | |
436 | - | |
437 | - int count = 0; | |
438 | - for (Object slist1 : slist) { | |
439 | - SearchData sdat = (SearchData) slist1; | |
440 | - | |
441 | - Element cslist = document.createElement("searchlist"); | |
442 | - cslist.setAttribute("listNo", String.valueOf(++count)); | |
443 | - | |
444 | - addChild(cslist, "item", sdat.getitem()); | |
445 | - addChild(cslist, "htmltag", sdat.getHtmltag()); | |
446 | - addChild(cslist, "htmlid", sdat.getHtmlid()); | |
447 | - addChild(cslist, "htmlclass", sdat.getHtmlclass()); | |
448 | - addChild(cslist, "around", sdat.getaround()); | |
449 | - addChild(cslist, "regexp", sdat.getregexp()); | |
450 | - | |
451 | - root.appendChild(cslist); | |
452 | - } | |
453 | - } | |
454 | - | |
455 | - void saveMsg404(String msg) { | |
456 | - checkdoc(); | |
457 | - removeElement("msg404"); // 既にElementが存在してた場合、一度削除 | |
458 | - | |
459 | - String[] msgs = msg.split("\n"); | |
460 | - int count = 0; | |
461 | - for(String msgOne : msgs) { | |
462 | - Element msgElement = document.createElement("msg404"); | |
463 | - msgElement.setAttribute("No", String.valueOf(++count)); | |
464 | - msgElement.appendChild(document.createTextNode(msgOne)); | |
465 | - | |
466 | - root.appendChild(msgElement); | |
467 | - } | |
468 | - } | |
469 | - | |
470 | - public void saveElement(Element element) { | |
471 | - checkdoc(); | |
472 | - removeElement(element.getTagName()); // 既にElementが存在してた場合、一度削除 | |
473 | - | |
474 | - root.appendChild(element); | |
475 | - } | |
476 | - | |
477 | - /* ---------------------------------------------------------------------- */ | |
478 | - | |
479 | - private void addChild(Element cslist, String keyword, String data) { | |
480 | - if(!data.isEmpty()) { | |
481 | - Element element = document.createElement(keyword); | |
482 | - element.appendChild(document.createTextNode(data)); | |
483 | - cslist.appendChild(element); | |
484 | - } | |
485 | - } | |
486 | - | |
487 | - private void removeElement(String elementTagName) { | |
488 | - int nodeSize; | |
489 | - do { | |
490 | - NodeList nodelist = document.getElementsByTagName(elementTagName); | |
491 | - nodeSize = nodelist.getLength(); | |
492 | - for(int i = 0; i < nodelist.getLength(); i++) { | |
493 | - Node node = nodelist.item(i); | |
494 | - root.removeChild(node); | |
495 | - } | |
496 | - } while(nodeSize > 0); | |
497 | - } | |
498 | - | |
499 | - /** | |
500 | - * ドキュメントチェック. | |
501 | - * 新規の場合やXMLファイルの読込みが行われていない状態時、新たにルートエレメントを作成する。 | |
502 | - * 既読の場合、ルートエレメントの取得を行う。 | |
503 | - */ | |
504 | - public void checkdoc() { | |
505 | - if(document == null) { | |
506 | - DOMImplementation domImpl = builder.getDOMImplementation(); | |
507 | - document = domImpl.createDocument("","searchdata",null); | |
508 | - } | |
509 | - root = document.getDocumentElement(); | |
510 | - } | |
511 | - | |
512 | - /** | |
513 | - * XML読込み. | |
514 | - * @param file | |
515 | - */ | |
516 | - public void read(File file) { | |
517 | - try { | |
518 | - document = builder.parse(file); | |
519 | - root = document.getDocumentElement(); | |
520 | - | |
521 | - } catch (SAXException | IOException ex) { | |
522 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
523 | - } | |
524 | - } | |
525 | - | |
526 | - /** | |
527 | - * XML書込み. | |
528 | - * @param file | |
529 | - */ | |
530 | - public void write(File file) { | |
531 | - try { | |
532 | - TransformerFactory transFactory = TransformerFactory.newInstance(); | |
533 | - Transformer transformer = transFactory.newTransformer(); | |
534 | - | |
535 | - DOMSource source = new DOMSource(document); | |
536 | - FileOutputStream os = new FileOutputStream(file); | |
537 | - StreamResult result = new StreamResult(os); | |
538 | - transformer.transform(source, result); | |
539 | - | |
540 | - } catch (TransformerConfigurationException ex) { | |
541 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
542 | - } catch (FileNotFoundException | TransformerException ex) { | |
543 | - Logger.getLogger(SearchDataRW.class.getName()).log(Level.SEVERE, null, ex); | |
544 | - } | |
545 | - } | |
546 | - | |
547 | -} |
@@ -0,0 +1,142 @@ | ||
1 | +/* | |
2 | + * Copyright (C) 2014-2015 kgto. | |
3 | + * | |
4 | + * This library is free software; you can redistribute it and/or | |
5 | + * modify it under the terms of the GNU Lesser General Public | |
6 | + * License as published by the Free Software Foundation; either | |
7 | + * version 2.1 of the License, or (at your option) any later version. | |
8 | + * | |
9 | + * This library is distributed in the hope that it will be useful, | |
10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 | + * Lesser General Public License for more details. | |
13 | + * | |
14 | + * You should have received a copy of the GNU Lesser General Public | |
15 | + * License along with this library; if not, write to the Free Software | |
16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
17 | + * MA 02110-1301 USA | |
18 | + */ | |
19 | +/* | |
20 | + * $Id$ | |
21 | + */ | |
22 | + | |
23 | +package webScraping.utility; | |
24 | + | |
25 | +import java.io.File; | |
26 | +import java.io.FileNotFoundException; | |
27 | +import java.io.FileOutputStream; | |
28 | +import java.io.IOException; | |
29 | +import java.util.logging.Level; | |
30 | +import java.util.logging.Logger; | |
31 | + | |
32 | +import javax.xml.parsers.DocumentBuilder; | |
33 | +import javax.xml.parsers.DocumentBuilderFactory; | |
34 | +import javax.xml.parsers.ParserConfigurationException; | |
35 | +import javax.xml.transform.Transformer; | |
36 | +import javax.xml.transform.TransformerConfigurationException; | |
37 | +import javax.xml.transform.TransformerException; | |
38 | +import javax.xml.transform.TransformerFactory; | |
39 | +import javax.xml.transform.dom.DOMSource; | |
40 | +import javax.xml.transform.stream.StreamResult; | |
41 | + | |
42 | +import org.w3c.dom.DOMImplementation; | |
43 | +import org.w3c.dom.Document; | |
44 | +import org.w3c.dom.Element; | |
45 | +import org.w3c.dom.Node; | |
46 | +import org.w3c.dom.NodeList; | |
47 | +import org.xml.sax.SAXException; | |
48 | + | |
49 | +public class LibraryXml { | |
50 | + | |
51 | + String xmlrootname = "xmlcontainer"; | |
52 | + | |
53 | + DocumentBuilder builder; | |
54 | + public Document readdoc, writedoc; | |
55 | + Element xmlroot; | |
56 | + | |
57 | + /* ---------------------------------------------------------------------- * | |
58 | + * コンストラクタ | |
59 | + * ---------------------------------------------------------------------- */ | |
60 | + public LibraryXml() { | |
61 | + try { | |
62 | + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); | |
63 | + builder = factory.newDocumentBuilder(); | |
64 | + | |
65 | + } catch (ParserConfigurationException ex) { | |
66 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
67 | + } | |
68 | + } | |
69 | + | |
70 | + /* ---------------------------------------------------------------------- * | |
71 | + * メソッド | |
72 | + * ---------------------------------------------------------------------- */ | |
73 | + /* 書込み処理 */ | |
74 | + public Element getwriteRoot(String elementName) { | |
75 | + mainElement(); | |
76 | + Element element = writedoc.createElement(elementName); | |
77 | + xmlroot.appendChild(element); | |
78 | + return element; | |
79 | + } | |
80 | + | |
81 | + private void mainElement() { | |
82 | + if(writedoc == null) { | |
83 | + DOMImplementation domImpl = builder.getDOMImplementation(); | |
84 | + writedoc = domImpl.createDocument("", xmlrootname, null); | |
85 | + xmlroot = writedoc.getDocumentElement(); | |
86 | + } | |
87 | + } | |
88 | + | |
89 | + /** | |
90 | + * XML書込み. | |
91 | + * @param file | |
92 | + */ | |
93 | + public void write(File file) { | |
94 | + try (FileOutputStream os = new FileOutputStream(file)) { | |
95 | + TransformerFactory transFactory = TransformerFactory.newInstance(); | |
96 | + Transformer transformer = transFactory.newTransformer(); | |
97 | + | |
98 | + transformer.setOutputProperty("indent", "yes"); // 改行指定 | |
99 | + transformer.setOutputProperty("method", "xml"); | |
100 | + | |
101 | + DOMSource source = new DOMSource(writedoc); | |
102 | + StreamResult result = new StreamResult(os); | |
103 | + transformer.transform(source, result); | |
104 | + | |
105 | + // 作成したXMLをクリア | |
106 | + writedoc = null; | |
107 | + | |
108 | + } catch (TransformerConfigurationException ex) { | |
109 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
110 | + } catch (FileNotFoundException | TransformerException ex) { | |
111 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
112 | + } catch (IOException ex) { | |
113 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
114 | + } | |
115 | + } | |
116 | + | |
117 | + /* ---------------------------------------------------------------------- */ | |
118 | + /* 読込み処理 */ | |
119 | + | |
120 | + public Element getreadRoot(String elementName) { | |
121 | + NodeList nodelist = xmlroot.getElementsByTagName(elementName); | |
122 | + Node node = nodelist.item(0); | |
123 | + return (node.getNodeType() == Node.ELEMENT_NODE ? (Element)node : null); | |
124 | + } | |
125 | + | |
126 | + /** | |
127 | + * XML読込み. | |
128 | + * @param file | |
129 | + */ | |
130 | + public void read(File file) { | |
131 | + try { | |
132 | + readdoc = builder.parse(file); | |
133 | + xmlroot = readdoc.getDocumentElement(); | |
134 | + | |
135 | + } catch (SAXException | IOException ex) { | |
136 | + Logger.getLogger(LibraryXml.class.getName()).log(Level.SEVERE, null, ex); | |
137 | + } | |
138 | + } | |
139 | + | |
140 | + /* ---------------------------------------------------------------------- */ | |
141 | + | |
142 | +} |
@@ -0,0 +1,198 @@ | ||
1 | +/* | |
2 | + * Copyright (C) 2014-2015 kgto. | |
3 | + * | |
4 | + * This library is free software; you can redistribute it and/or | |
5 | + * modify it under the terms of the GNU Lesser General Public | |
6 | + * License as published by the Free Software Foundation; either | |
7 | + * version 2.1 of the License, or (at your option) any later version. | |
8 | + * | |
9 | + * This library is distributed in the hope that it will be useful, | |
10 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 | + * Lesser General Public License for more details. | |
13 | + * | |
14 | + * You should have received a copy of the GNU Lesser General Public | |
15 | + * License along with this library; if not, write to the Free Software | |
16 | + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
17 | + * MA 02110-1301 USA | |
18 | + */ | |
19 | +/* | |
20 | + * $Id$ | |
21 | + */ | |
22 | + | |
23 | +package webScraping.utility; | |
24 | + | |
25 | +import webScraping.core.SearchData; | |
26 | +import java.io.File; | |
27 | +import java.util.ArrayList; | |
28 | +import org.w3c.dom.Element; | |
29 | +import org.w3c.dom.Node; | |
30 | +import org.w3c.dom.NodeList; | |
31 | + | |
32 | +public class ScrapingXml { | |
33 | + /* ---------------------------------------------------------------------- * | |
34 | + * フィールド | |
35 | + * ---------------------------------------------------------------------- */ | |
36 | + String rootnameScraping = "webscraping"; | |
37 | + | |
38 | + private String testUrl; | |
39 | + private SearchData[] sdata; | |
40 | + | |
41 | + public LibraryXml xlib = new LibraryXml(); | |
42 | + public Element root; | |
43 | + | |
44 | + /* ---------------------------------------------------------------------- * | |
45 | + * コンストラクタ | |
46 | + * ---------------------------------------------------------------------- */ | |
47 | + public ScrapingXml() { | |
48 | + } | |
49 | + | |
50 | + /* ---------------------------------------------------------------------- * | |
51 | + * Setter | |
52 | + * ---------------------------------------------------------------------- */ | |
53 | + public void setTestUrl(String testUrl) { | |
54 | + this.testUrl = testUrl; | |
55 | + } | |
56 | + | |
57 | + public void setSdata() { | |
58 | + this.sdata = new SearchData[SearchData.size()]; | |
59 | + for(int i = 0; i < SearchData.size(); i++) { | |
60 | + this.sdata[i] = SearchData.get(i); | |
61 | + } | |
62 | + } | |
63 | + | |
64 | + /* ---------------------------------------------------------------------- * | |
65 | + * Getter | |
66 | + * ---------------------------------------------------------------------- */ | |
67 | + public String getTestUrl() { | |
68 | + return testUrl; | |
69 | + } | |
70 | + | |
71 | + public void getSdata() { | |
72 | + SearchData.clear(); | |
73 | + for(SearchData sdata1 : sdata) { | |
74 | + SearchData.add(sdata1); | |
75 | + } | |
76 | + } | |
77 | + | |
78 | + /* ---------------------------------------------------------------------- * | |
79 | + * メソッド | |
80 | + * ---------------------------------------------------------------------- */ | |
81 | + public void save(File file) { | |
82 | + | |
83 | + elementset(); | |
84 | + | |
85 | + xlib.write(file); | |
86 | + } | |
87 | + | |
88 | + public void elementset() { | |
89 | + root = xlib.getwriteRoot(rootnameScraping); | |
90 | + elementsetUrl(); | |
91 | + elementsetSearchdata(); | |
92 | + System.out.println("elementset XmlScraping"); | |
93 | + } | |
94 | + | |
95 | + private void elementsetUrl() { | |
96 | + Element url = xlib.writedoc.createElement("url"); | |
97 | + url.appendChild(xlib.writedoc.createTextNode(testUrl)); | |
98 | + root.appendChild(url); | |
99 | + } | |
100 | + | |
101 | + private void elementsetSearchdata() { | |
102 | + int count = 0; | |
103 | + for(SearchData sdat : sdata) { | |
104 | + Element cslist = xlib.writedoc.createElement("searchlist"); | |
105 | + cslist.setAttribute("listNo", String.valueOf(++count)); | |
106 | + | |
107 | + addChild(cslist, "item" , sdat.getitem()); | |
108 | + addChild(cslist, "htmltag" , sdat.getHtmltag()); | |
109 | + addChild(cslist, "htmlid" , sdat.getHtmlid()); | |
110 | + addChild(cslist, "htmlclass", sdat.getHtmlclass()); | |
111 | + addChild(cslist, "around" , sdat.getaround()); | |
112 | + addChild(cslist, "regexp" , sdat.getregexp()); | |
113 | + | |
114 | + root.appendChild(cslist); | |
115 | + } | |
116 | + } | |
117 | + | |
118 | + private void addChild(Element cslist, String keyword, String data) { | |
119 | + if(!data.isEmpty()) { | |
120 | + Element element = xlib.writedoc.createElement(keyword); | |
121 | + element.appendChild(xlib.writedoc.createTextNode(data)); | |
122 | + cslist.appendChild(element); | |
123 | + } | |
124 | + } | |
125 | + | |
126 | + /* ---------------------------------------------------------------------- */ | |
127 | + | |
128 | + void load(File file) { | |
129 | + xlib.read(file); | |
130 | + elementget(); | |
131 | + } | |
132 | + | |
133 | + public void elementget() { | |
134 | + root = xlib.getreadRoot(rootnameScraping); | |
135 | + elementgetUrl(); | |
136 | + elementgetSearchdata(); | |
137 | + } | |
138 | + | |
139 | + private void elementgetUrl() { | |
140 | + NodeList nodelist = root.getElementsByTagName("url"); | |
141 | + Node node = nodelist.item(0); | |
142 | + testUrl = node.getFirstChild().getNodeValue(); | |
143 | + } | |
144 | + | |
145 | + private void elementgetSearchdata() { | |
146 | + ArrayList<SearchData> slist = new ArrayList<>(); | |
147 | + | |
148 | + NodeList nodelist = root.getElementsByTagName("searchlist"); | |
149 | + for(int i = 0; i < nodelist.getLength(); i++) { | |
150 | + Node childnode = nodelist.item(i); | |
151 | + | |
152 | + boolean sdatflg = false; | |
153 | + SearchData sdat = new SearchData(); | |
154 | + for (Node child = childnode.getFirstChild(); child != null; child = child.getNextSibling()) { | |
155 | + if(child.getNodeType() == Node.ELEMENT_NODE) { | |
156 | + String tag = child.getNodeName(); | |
157 | + String rtn = ""; | |
158 | + if(child.getFirstChild() != null) { | |
159 | + rtn = child.getFirstChild().getNodeValue(); | |
160 | + } | |
161 | + switch (tag) { | |
162 | + case "item" : | |
163 | + sdat.setitem(rtn); | |
164 | + sdatflg = true; | |
165 | + break; | |
166 | + case "htmltag" : | |
167 | + sdat.setHtmltag(rtn); | |
168 | + sdatflg = true; | |
169 | + break; | |
170 | + case "htmlid" : | |
171 | + sdat.setHtmlid(rtn); | |
172 | + sdatflg = true; | |
173 | + break; | |
174 | + case "htmlclass" : | |
175 | + sdat.setHtmlclass(rtn); | |
176 | + sdatflg = true; | |
177 | + break; | |
178 | + case "around" : | |
179 | + sdat.setaround(rtn); | |
180 | + sdatflg = true; | |
181 | + break; | |
182 | + case "regexp" : | |
183 | + sdat.setregexp(rtn); | |
184 | + sdatflg = true; | |
185 | + break; | |
186 | + } | |
187 | + } | |
188 | + } | |
189 | + if(sdatflg) slist.add(sdat); | |
190 | + } | |
191 | + // 配列化 | |
192 | + sdata = new SearchData[slist.size()]; | |
193 | + for(int i = 0; i < slist.size(); i++) { | |
194 | + sdata[i] = slist.get(i); | |
195 | + } | |
196 | + } | |
197 | + | |
198 | +} |
@@ -40,7 +40,7 @@ | ||
40 | 40 | * @author kgto |
41 | 41 | */ |
42 | 42 | public class HtmlSearch extends javax.swing.JFrame { |
43 | - private final SearchDataRW sio = new SearchDataRW(); | |
43 | + private final ScrapingXml xmlwriter = new ScrapingXml(); | |
44 | 44 | |
45 | 45 | SearchDataTableModel sdatatblmodel; |
46 | 46 |
@@ -332,8 +332,9 @@ | ||
332 | 332 | int selected = jFileChooser1.showOpenDialog(this); |
333 | 333 | if (selected == JFileChooser.APPROVE_OPTION) { |
334 | 334 | File file = jFileChooser1.getSelectedFile(); |
335 | - sio.load(file); | |
336 | - jTxtUrl.setText(sio.geturl()); | |
335 | + xmlwriter.load(file); | |
336 | + jTxtUrl.setText(xmlwriter.getTestUrl()); | |
337 | + xmlwriter.getSdata(); | |
337 | 338 | sdatatblmodel.setRowCount(0); |
338 | 339 | for(int i = 0; i < SearchData.size(); i++) { |
339 | 340 | SearchData sdata = SearchData.get(i); |
@@ -347,7 +348,7 @@ | ||
347 | 348 | int selected = jFileChooser1.showSaveDialog(this); |
348 | 349 | if (selected == JFileChooser.APPROVE_OPTION) { |
349 | 350 | File file = jFileChooser1.getSelectedFile(); |
350 | - sio.seturl(jTxtUrl.getText()); | |
351 | + xmlwriter.setTestUrl(jTxtUrl.getText()); | |
351 | 352 | |
352 | 353 | SearchData.clear(); |
353 | 354 | for(int row = 0; row < sdatatblmodel.getRowCount(); row++) { |
@@ -354,7 +355,8 @@ | ||
354 | 355 | SearchData sdata = sdatatblmodel.getSearchData(row); |
355 | 356 | SearchData.add(sdata); |
356 | 357 | } |
357 | - sio.save(file); | |
358 | + xmlwriter.setSdata(); | |
359 | + xmlwriter.save(file); | |
358 | 360 | } |
359 | 361 | }//GEN-LAST:event_jMenuSaveActionPerformed |
360 | 362 |
@@ -76,7 +76,8 @@ | ||
76 | 76 | for (Object AttrList1 : AttrList) { |
77 | 77 | AttrData a = (AttrData)AttrList1; |
78 | 78 | if(a.tag == tag) { |
79 | - if(a.attrname.equals(attrname) && a.attrvalue.equals(attrvalue)) { | |
79 | + //if(a.attrname.equals(attrname) && a.attrvalue.equals(attrvalue)) { | |
80 | + if(a.attrname.equals(attrname) && a.attrvalue.startsWith(attrvalue)) { | |
80 | 81 | ret = true; |
81 | 82 | } |
82 | 83 | } |
@@ -33,7 +33,9 @@ | ||
33 | 33 | * @author kgto |
34 | 34 | */ |
35 | 35 | class HtmlParserCallback extends HTMLEditorKit.ParserCallback { |
36 | - | |
36 | + /* ---------------------------------------------------------------------- * | |
37 | + * フィールド | |
38 | + * ---------------------------------------------------------------------- */ | |
37 | 39 | // Tag毎の階層 |
38 | 40 | HashMap<HTML.Tag,Integer> tagMap = new HashMap<>(); |
39 | 41 |
@@ -54,6 +56,9 @@ | ||
54 | 56 | // 属性データ |
55 | 57 | AttributeData attrdata; |
56 | 58 | |
59 | + /* ---------------------------------------------------------------------- * | |
60 | + * コンストラクタ | |
61 | + * ---------------------------------------------------------------------- */ | |
57 | 62 | protected HtmlParserCallback(SearchData skey) { |
58 | 63 | |
59 | 64 | // キー情報展開 |
@@ -64,10 +69,16 @@ | ||
64 | 69 | sData = new ArrayList(); |
65 | 70 | } |
66 | 71 | |
72 | + /* ---------------------------------------------------------------------- * | |
73 | + * Getter | |
74 | + * ---------------------------------------------------------------------- */ | |
67 | 75 | ArrayList getrtnData() { |
68 | 76 | return this.sData; |
69 | 77 | } |
70 | 78 | |
79 | + /* ---------------------------------------------------------------------- * | |
80 | + * メソッド | |
81 | + * ---------------------------------------------------------------------- */ | |
71 | 82 | @Override |
72 | 83 | public void handleStartTag(HTML.Tag tag, MutableAttributeSet attr, int pos){ |
73 | 84 | // Tag毎の階層を保持 |
@@ -42,6 +42,28 @@ | ||
42 | 42 | /* ---------------------------------------------------------------------- * |
43 | 43 | * static 処理 |
44 | 44 | * ---------------------------------------------------------------------- */ |
45 | + public static class Context { | |
46 | + public Class columnClass; | |
47 | + public String columnName; | |
48 | + public String columnNameJp; | |
49 | + | |
50 | + public Context(Class columnClass, String columnName, String columnNameJp) { | |
51 | + this.columnClass = columnClass; | |
52 | + this.columnName = columnName; | |
53 | + this.columnNameJp = columnNameJp; | |
54 | + } | |
55 | + } | |
56 | + | |
57 | + public static final Context[] context = { | |
58 | + /* 0 */ new Context(String.class , "item" , "項目名"), | |
59 | + /* 1 */ new Context(String.class , "htmltag" , "タグ"), | |
60 | + /* 2 */ new Context(String.class , "htmlid" , "ID"), | |
61 | + /* 3 */ new Context(String.class , "htmlclass" , "クラス"), | |
62 | + /* 4 */ new Context(String.class , "around" , "位置"), | |
63 | + /* 5 */ new Context(String.class , "regexp" , "抽出条件") | |
64 | + }; | |
65 | + | |
66 | + /* ---------------------------------------------------------------------- */ | |
45 | 67 | private static ArrayList<SearchData> slist = new ArrayList<>(); |
46 | 68 | |
47 | 69 | public static void addSearchData( |
@@ -162,5 +184,17 @@ | ||
162 | 184 | this.around = ""; |
163 | 185 | this.regexp = ""; |
164 | 186 | } |
165 | - | |
187 | + | |
188 | + public Object[] getObjData() { | |
189 | + Object[] obj = { | |
190 | + /* 0 */ getitem(), // 項目名 | |
191 | + /* 1 */ getHtmltag(), // タグ | |
192 | + /* 2 */ getHtmlid(), // ID | |
193 | + /* 3 */ getHtmlclass(), // クラス | |
194 | + /* 4 */ getaround(), // 位置 | |
195 | + /* 5 */ getregexp() // 抽出条件 | |
196 | + }; | |
197 | + return obj; | |
198 | + } | |
199 | + | |
166 | 200 | } |
@@ -32,20 +32,25 @@ | ||
32 | 32 | import javax.swing.text.html.parser.ParserDelegator; |
33 | 33 | |
34 | 34 | /** |
35 | - * | |
35 | + * HTMLパーサ. | |
36 | 36 | * @author kgto |
37 | 37 | */ |
38 | 38 | public class HtmlParser { |
39 | - | |
39 | + /* ---------------------------------------------------------------------- * | |
40 | + * フィールド | |
41 | + * ---------------------------------------------------------------------- */ | |
40 | 42 | URL url; |
41 | 43 | String pageData; |
42 | 44 | ArrayList sData; |
43 | 45 | |
44 | 46 | // 作業ワーク |
45 | - String htmltag; | |
46 | - String htmlid; | |
47 | - String htmlclass; | |
47 | + private String htmltag; | |
48 | + private String htmlid; | |
49 | + private String htmlclass; | |
48 | 50 | |
51 | + /* ---------------------------------------------------------------------- * | |
52 | + * コンストラクタ | |
53 | + * ---------------------------------------------------------------------- */ | |
49 | 54 | public HtmlParser(URL UrlAdress) { |
50 | 55 | DebugProcess.debuglog_set(); |
51 | 56 | this.url = UrlAdress; |
@@ -68,15 +73,24 @@ | ||
68 | 73 | url = null; |
69 | 74 | } |
70 | 75 | |
76 | + /* ---------------------------------------------------------------------- * | |
77 | + * Getter | |
78 | + * ---------------------------------------------------------------------- */ | |
71 | 79 | public String getStringPageData() { |
72 | 80 | return pageData; |
73 | 81 | } |
74 | 82 | |
83 | + /* ---------------------------------------------------------------------- * | |
84 | + * Setter | |
85 | + * ---------------------------------------------------------------------- */ | |
75 | 86 | public void seturl(URL UrlAdress) { |
76 | 87 | this.url = UrlAdress; |
77 | 88 | getPageData(); |
78 | 89 | } |
79 | 90 | |
91 | + /* ---------------------------------------------------------------------- * | |
92 | + * メソッド | |
93 | + * ---------------------------------------------------------------------- */ | |
80 | 94 | public void seturl(String UrlAdress) { |
81 | 95 | try { |
82 | 96 | url = new URL(UrlAdress); |