news4 - RSS aggregation system
修订版 | 2244893f997089e97f35e075de10ccad4cb33c5f (tree) |
---|---|
时间 | 2012-11-07 05:33:20 |
作者 | hylom <hylom@hylo...> |
Committer | hylom |
Merge branch 'master' into live
@@ -14,9 +14,12 @@ | ||
14 | 14 | gnewsでサイトを生成するために、下記の設定ファイルが必要です。 |
15 | 15 | |
16 | 16 | === config.py === |
17 | - HTMLの生成先やサイト名、RSSの取得先などを設定するファイルです。PythonのDictionaryおよびArray形式で記述されています。 | |
17 | + HTMLの生成先やサイト名などを設定するファイルです。PythonのDictionaryおよびArray形式で記述されています。 | |
18 | 18 | config.py.sampleをコピーしてconfig.pyを作成し、編集します。 |
19 | 19 | |
20 | +=== sources.ini === | |
21 | + RSSの取得先を設定するファイルです。ini形式で記述されています。セクション名が表示されるサイト名、urlパラメータがサイトのURL、sourceパラメータがRSSの取得先、filtersパラメータが使用するフィルタ一覧(カンマ区切り)となります。 | |
22 | + | |
20 | 23 | === install.conf === |
21 | 24 | 関連ファイルのコピー先を指定するファイルです。 |
22 | 25 | install.conf.sampleをコピーしてinstall.confを作成し、編集します。 |
@@ -7,6 +7,7 @@ config = { | ||
7 | 7 | 'output_directory': 'outputs', |
8 | 8 | 'filter_directory': 'filters', |
9 | 9 | 'pagination_unit': 20, |
10 | + 'log_level': 0, | |
10 | 11 | 'index': { |
11 | 12 | 'template': 'index.tmpl.html', |
12 | 13 | 'output_directory': 'outputs/', |
@@ -30,59 +31,7 @@ config = { | ||
30 | 31 | 'post_filters': [ |
31 | 32 | 'cleanup', |
32 | 33 | 'trimming', |
34 | + 'remove_tracker' | |
33 | 35 | ], |
34 | 36 | } |
35 | 37 | |
36 | -target_rss = [ | |
37 | - { | |
38 | - 'name': 'SourceForge.JP Magazine', | |
39 | - 'url': 'http://rss.rssad.jp/rss/sourceforge/magazine/rss', | |
40 | - 'source_url': 'http://sourceforge.jp/magazine/', | |
41 | - }, | |
42 | - { | |
43 | - 'name': 'Slashdot Japan', | |
44 | - 'url': 'http://rss.rssad.jp/rss/slashdot/slashdot.rss', | |
45 | - 'source_url': 'http://slashdot.jp/', | |
46 | - 'filter': ['slashdotjp',], | |
47 | - }, | |
48 | - { | |
49 | - 'name': 'ITmedia', | |
50 | - 'url': 'http://rss.rssad.jp/rss/itmtop/2.0/itmedia_all.xml', | |
51 | - 'source_url': 'http://www.itmedia.co.jp/', | |
52 | - 'filter': ['tagging', 'itmedia'], | |
53 | - }, | |
54 | - { | |
55 | - 'name': 'So-netセキュリティ通信', | |
56 | - 'url': 'http://security-t.blog.so-net.ne.jp/index.xml', | |
57 | - 'source_url': 'http://security-t.blog.so-net.ne.jp/', | |
58 | - 'filter': ['tagging',], | |
59 | - }, | |
60 | - { | |
61 | - 'name': 'Engadget Japanese', | |
62 | - 'url': 'http://japanese.engadget.com/rss.xml', | |
63 | - 'source_url': 'http://japanese.engadget.com/', | |
64 | - 'filter': ['tagging',], | |
65 | - }, | |
66 | - { | |
67 | - 'name': 'ギズモード・ジャパン', | |
68 | - 'url': 'http://feeds.gizmodo.jp/rss/gizmodo/index.xml', | |
69 | - 'source_url': 'http://www.gizmodo.jp/', | |
70 | - 'filter': ['tagging',], | |
71 | - }, | |
72 | - { | |
73 | - 'name': 'TechCrunch Japan', | |
74 | - 'url': 'http://jp.techcrunch.com/feed/', | |
75 | - 'source_url': 'http://jp.techcrunch.com/', | |
76 | - 'filter': ['tagging',], | |
77 | - }, | |
78 | - ] | |
79 | - | |
80 | - | |
81 | -"""Template: | |
82 | - { | |
83 | - 'name': '', | |
84 | - 'url': '', | |
85 | - 'source_url': '', | |
86 | - 'filter': ['tagging',], | |
87 | - }, | |
88 | -""" |
@@ -0,0 +1,21 @@ | ||
1 | +#!/usr/bin/python | |
2 | + | |
3 | +from config import config as config, target_rss as target_rss | |
4 | +import ConfigParser | |
5 | +import sys | |
6 | + | |
def main():
    """Dump the legacy ``target_rss`` list to stdout in ini format.

    One section is written per feed, named after the feed's "name".
    The old "url" value (the RSS address) is stored as ``source`` and the
    old "source_url" value (the site address) as ``url``.  A "filter"
    list, when present, is stored as the comma-joined ``filters`` option.
    """
    parser = ConfigParser.SafeConfigParser()
    for feed in target_rss:
        section = feed["name"]
        parser.add_section(section)
        parser.set(section, 'source', feed["url"])
        parser.set(section, 'url', feed["source_url"])
        if 'filter' in feed:
            parser.set(section, 'filters', ",".join(feed["filter"]))
    parser.write(sys.stdout)
17 | + | |
18 | +if __name__ == '__main__': | |
19 | + main() | |
20 | + | |
21 | + |
@@ -0,0 +1,27 @@ | ||
1 | +# configloader.py | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import ConfigParser | |
5 | + | |
6 | +CONFIG_FILE = 'sources.ini' | |
7 | + | |
def load():
    """Parse the sources ini file and return the feed definitions.

    Every section of ``CONFIG_FILE`` describes one feed.

    Returns:
        A list of dicts, one per section, with the keys:
        "name"    -- the section name
        "source"  -- value of the section's ``source`` option
        "url"     -- value of the section's ``url`` option
        "filters" -- list of filter names; only present when the section
                     has a ``filters`` option (comma separated)

    Raises:
        IOError: when ``CONFIG_FILE`` cannot be opened.
        ConfigParser.NoOptionError: when a section lacks ``source`` or
            ``url``.
    """
    config = ConfigParser.SafeConfigParser()
    fp = open(CONFIG_FILE, 'r')
    try:
        config.readfp(fp)
    finally:
        # release the handle even when the file fails to parse
        fp.close()

    sources = []
    for section in config.sections():
        source = {
            "name": section,
            "source": config.get(section, 'source'),
            "url": config.get(section, 'url'),
        }
        if config.has_option(section, 'filters'):
            names = [x.strip()
                     for x in config.get(section, 'filters').split(',')]
            # An empty value or a trailing comma would otherwise produce
            # a filter called '' that cannot be loaded.
            source["filters"] = [x for x in names if x]
        sources.append(source)
    return sources
26 | + | |
27 | + |
@@ -10,21 +10,33 @@ a { | ||
10 | 10 | margin-bottom: 1em; |
11 | 11 | } |
12 | 12 | |
13 | +.entry-header .thumbnail { | |
14 | + width: 100px; | |
15 | + float: right; | |
16 | + margin-left: 10px; | |
17 | + margin-bottom: 10px; | |
18 | + padding: 1px; | |
19 | +} | |
20 | + | |
13 | 21 | .entry-continue { |
14 | 22 | margin-bottom: 1em; |
15 | 23 | } |
16 | 24 | .entry-footer{ |
17 | - color: gray; | |
25 | + color: #888; | |
18 | 26 | } |
19 | 27 | |
20 | 28 | #site-header { |
21 | - border-bottom: 1px solid gray; | |
29 | + border-bottom: 1px solid #888; | |
22 | 30 | margin-bottom: 10px; |
23 | 31 | } |
24 | 32 | |
33 | +#site-header .last-update { | |
34 | + color: #888; | |
35 | +} | |
36 | + | |
25 | 37 | #site-footer { |
26 | 38 | margin-top: 10px; |
27 | - color: gray; | |
39 | + color: #888; | |
28 | 40 | text-align: center; |
29 | 41 | } |
30 | 42 |
@@ -5,7 +5,7 @@ import re | ||
5 | 5 | |
6 | 6 | import feedparser |
7 | 7 | import dateutil.parser |
8 | -from config import config as config, target_rss as target_rss | |
8 | +from config import config as config | |
9 | 9 | from logger import log |
10 | 10 | |
11 | 11 | class FeedFetcher(object): |
@@ -15,13 +15,13 @@ class FeedFetcher(object): | ||
15 | 15 | |
16 | 16 | def _fetch(self): |
17 | 17 | 'do fetch' |
18 | - f = feedparser.parse(self._feed["url"]) | |
18 | + f = feedparser.parse(self._feed["source"]) | |
19 | 19 | entries = [] |
20 | 20 | for e in f['entries']: |
21 | 21 | entry = { |
22 | 22 | # 'title': e.title.decode('utf8') if isinstance(e.title, str) else e.title, |
23 | 23 | 'title': e.title, |
24 | - 'link': e.link, | |
24 | + 'url': e.link, | |
25 | 25 | 'body': e.description, |
26 | 26 | 'date': dateutil.parser.parse(e.updated), |
27 | 27 | 'feed': self._feed, |
@@ -58,8 +58,8 @@ class FeedFetcher(object): | ||
58 | 58 | entries = self._fetch() |
59 | 59 | entries = self._apply_pre_filters(entries) |
60 | 60 | |
61 | - if 'filter' in self._feed: | |
62 | - filters = self._feed.get('filter', None) | |
61 | + if 'filters' in self._feed: | |
62 | + filters = self._feed.get('filters', None) | |
63 | 63 | entries = self._apply_filters(filters, entries) |
64 | 64 | |
65 | 65 | entries = self._apply_post_filters(entries) |
@@ -0,0 +1,15 @@ | ||
1 | +# remove images for tracking | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
# Matches image URLs served from the rss.rssad.jp host.  The dots are
# escaped so that only this exact host name matches.
re_rssad_url = re.compile(r'^http://rss\.rssad\.jp/')


def entry_filter(entry):
    """Remove rss.rssad.jp tracking images from ``entry["images"]``.

    The image list is filtered in place and the same entry object is
    returned.  Entries without an "images" key are returned untouched.
    """
    if "images" in entry:
        # Rebuild the list through slice assignment instead of popping
        # while looping over indices: popping shifts later items, which
        # skipped the next image and ended in an IndexError.
        entry["images"][:] = [url for url in entry["images"]
                              if not re_rssad_url.search(url)]

    return entry
15 | + |
@@ -1,20 +1,23 @@ | ||
1 | 1 | #!/usr/bin/python |
2 | 2 | 'gnews.py - google news clone' |
3 | 3 | |
4 | -from config import config as config, target_rss as target_rss | |
4 | +from config import config as config | |
5 | 5 | import renderer |
6 | 6 | import fetcher |
7 | 7 | import os.path |
8 | 8 | import urllib |
9 | 9 | from logger import log |
10 | +import configloader | |
11 | + | |
12 | +sources = configloader.load() | |
10 | 13 | |
11 | 14 | def main(): |
12 | 15 | "gnews's main function" |
13 | - # TODO: argv check | |
16 | +# TODO: argv check | |
14 | 17 | |
15 | 18 | # fetch RSS feed |
16 | 19 | entries = [] |
17 | - for feed in target_rss: | |
20 | + for feed in sources: | |
18 | 21 | f = fetcher.FeedFetcher(feed) |
19 | 22 | e = f.get_entries() |
20 | 23 | entries.extend(e) |
@@ -45,8 +48,14 @@ def main(): | ||
45 | 48 | for e in entries: |
46 | 49 | log(e["date"]) |
47 | 50 | |
51 | + call_plugin('pre_render', entries) | |
52 | + | |
48 | 53 | # do rendering |
49 | - params = {'tags':tags, 'page':{}, 'sorted_tags':sorted_tags} | |
54 | + params = { | |
55 | + 'tags':tags, | |
56 | + 'page':{}, | |
57 | + 'sorted_tags':sorted_tags | |
58 | + } | |
50 | 59 | |
51 | 60 | # render index page |
52 | 61 | do_rendering('index', 'index%s.html', entries, params) |
@@ -57,11 +66,43 @@ def main(): | ||
57 | 66 | do_rendering('tags', tag + '%s.html', subentries, params) |
58 | 67 | |
59 | 68 | |
def call_plugin(function_name, entries):
    """Call the hook ``function_name`` of every configured plugin.

    Plugins are taken from ``config['plugins']`` in order; each one is
    loaded with ``_get_plugin`` and its hook is called with ``entries``.

    Raises:
        PluginError: when a configured plugin cannot be found.
        AttributeError: when a plugin does not define the hook.
    """
    # 'plugins' is optional: a config without the key runs no plugins
    # instead of failing with a KeyError.
    for plugin in config.get('plugins', []):
        mod = _get_plugin(plugin)
        hook = getattr(mod, function_name)
        hook(entries)
75 | + | |
class PluginError(Exception):
    """Error for a plugin name that cannot be resolved to a module.

    Attributes:
        value -- the plugin name that was requested
    """

    def __init__(self, value):
        # Hand the name to Exception as well so that ``args`` and
        # ``repr()`` carry it.
        Exception.__init__(self, value)
        self.value = value

    def __str__(self):
        # %-formatting also copes with a non-string value
        return 'plugin "%s" is not found.' % (self.value,)
81 | + | |
82 | +def _get_plugin(plugin_name): | |
83 | + 'load plugin by config settings' | |
84 | + | |
85 | + # fallback when filter isn't defined | |
86 | + if plugin_name is None: | |
87 | + return lambda x:x | |
88 | + | |
89 | + # import module | |
90 | + mods = __import__(config['plugin_directory'], | |
91 | + globals(), | |
92 | + locals(), | |
93 | + [plugin_name,]) | |
94 | + try: | |
95 | + mod = mods.__getattribute__(plugin_name) | |
96 | + except AttributeError: | |
97 | + raise PluginError(plugin_name) | |
98 | + | |
99 | + return mod | |
100 | + | |
60 | 101 | |
61 | 102 | def do_rendering(page_type, filename, entries, params): |
62 | 103 | "rendering page" |
63 | 104 | |
64 | - r = renderer.Renderer() | |
105 | + r = renderer.Renderer(sources) | |
65 | 106 | tmpl = config[page_type]['template'] |
66 | 107 | output_dir = config[page_type]['output_directory'] |
67 | 108 |
@@ -1,6 +1,6 @@ | ||
1 | 1 | 'logger.py - log output utility' |
2 | 2 | |
3 | -from config import config, target_rss | |
3 | +from config import config | |
4 | 4 | |
5 | 5 | def log(*args): |
6 | 6 | "log helper function" |
@@ -0,0 +1,40 @@ | ||
1 | +#!/usr/bin/python | |
2 | +# -*- coding: utf-8 | |
3 | +'plugin for hatena bookmark counter' | |
4 | + | |
5 | +#from __future__ import with_statement | |
6 | + | |
7 | +import xmlrpclib | |
8 | +import datetime | |
9 | +import time | |
10 | +import sys | |
11 | + | |
12 | +# see http://d.hatena.ne.jp/keyword/%a4%cf%a4%c6%a4%ca%a5%d6%a5%c3%a5%af%a5%de%a1%bc%a5%af%b7%ef%bf%f4%bc%e8%c6%c0API?kid=146686 | |
13 | + | |
14 | +urls = [] | |
15 | +counts = {} | |
16 | + | |
def pre_fetch():
    """No-op hook: this plugin takes no action here."""
    return None
19 | + | |
def pre_tag_aggregate(entries):
    """No-op hook: ``entries`` is left untouched."""
    return None
22 | + | |
def pre_render(entries):
    """Annotate every entry with its Hatena Bookmark count.

    The URLs of ``entries`` are sent to ``_get_count`` in batches and the
    count for each entry is stored under the key
    'hatena_bookmark_count'.  Entries the service reports nothing for
    get 0.  The entries are modified in place; nothing is returned.
    """
    batch_size = 50  # _get_count accepts at most 50 URLs per call
    for start in range(0, len(entries), batch_size):
        batch = entries[start:start + batch_size]
        # bookmark.getCount answers with a struct keyed by URL
        result = _get_count([entry['url'] for entry in batch])
        for entry in batch:
            # Store the count under its own key: writing it to
            # entry['url'] would destroy the link that gets rendered.
            entry['hatena_bookmark_count'] = result.get(entry['url'], 0)
30 | + | |
def pre_quit(entries):
    """No-op hook: ``entries`` is left untouched."""
    return None
33 | + | |
def _get_count(urls):
    """Call ``bookmark.getCount`` on the Hatena XML-RPC endpoint.

    ``urls`` is passed as the positional arguments of the remote call
    and may hold at most 50 items.  The remote result is returned as is.
    """
    proxy = xmlrpclib.ServerProxy("http://b.hatena.ne.jp/xmlrpc")
    return proxy.bookmark.getCount(*urls)
40 | + |
@@ -7,17 +7,17 @@ from mako.lookup import TemplateLookup | ||
7 | 7 | from mako.exceptions import RichTraceback |
8 | 8 | import dateutil.parser |
9 | 9 | |
10 | -from config import config, target_rss | |
10 | +from config import config | |
11 | 11 | from propertizer import propertize |
12 | 12 | from logger import log |
13 | 13 | |
14 | 14 | def date_format(date): |
15 | - #dt = dateutil.parser.parse(date) | |
16 | 15 | return date.strftime('%Y/%m/%d %H:%M') |
17 | 16 | |
18 | 17 | class Renderer(object): |
19 | - def __init__(self): | |
18 | + def __init__(self, sources): | |
20 | 19 | self.template_dir = config['template_directory'] |
20 | + self._sources = sources | |
21 | 21 | |
22 | 22 | def _get_template(self, template_name): |
23 | 23 | 'read template file' |
@@ -38,8 +38,9 @@ class Renderer(object): | ||
38 | 38 | 'entries': entries, |
39 | 39 | 'params': params, |
40 | 40 | 'site': config['site_parameter'], |
41 | - 'targets': target_rss, | |
41 | + 'sources': self._sources, | |
42 | 42 | } |
43 | + kwargs['site']['last_update'] = datetime.datetime.utcnow() | |
43 | 44 | for key in kwargs: |
44 | 45 | d = propertize(kwargs[key]) |
45 | 46 | kwargs[key] = d |
@@ -0,0 +1,54 @@ | ||
1 | +[So-netセキュリティ通信] | |
2 | +url = http://security-t.blog.so-net.ne.jp/ | |
3 | +source = http://security-t.blog.so-net.ne.jp/index.rdf | |
4 | +filters = tagging | |
5 | + | |
6 | +[Impress Watch] | |
7 | +url = http://www.watch.impress.co.jp/ | |
8 | +source = http://rss.rssad.jp/rss/headline/headline.rdf | |
9 | +filters = tagging | |
10 | + | |
11 | +[japan.internet.com] | |
12 | +url = http://japan.internet.com/ | |
13 | +source = http://rss.internetcom.jp/rss/japaninternetcom/index.rdf | |
14 | +filters = tagging | |
15 | + | |
16 | +[SourceForge.JP Magazine] | |
17 | +url = http://sourceforge.jp/magazine/ | |
18 | +source = http://rss.rssad.jp/rss/sourceforge/magazine/rss | |
19 | + | |
20 | +[WIRED.jp] | |
21 | +url = http://wired.jp/ | |
22 | +source = http://rss.rssad.jp/rss/h/wired/feed.rdf | |
23 | +filters = tagging | |
24 | + | |
25 | +[CNET Japan] | |
26 | +url = http://japan.cnet.com/ | |
27 | +source = http://feeds.japan.cnet.com/rss/cnet/all.rdf | |
28 | +filters = tagging | |
29 | + | |
30 | +[TechCrunch Japan] | |
31 | +url = http://jp.techcrunch.com/ | |
32 | +source = http://jp.techcrunch.com/feed/ | |
33 | +filters = tagging | |
34 | + | |
35 | +[ギズモード・ジャパン] | |
36 | +url = http://www.gizmodo.jp/ | |
37 | +source = http://feeds.gizmodo.jp/rss/gizmodo/index.xml | |
38 | +filters = tagging | |
39 | + | |
40 | +[Slashdot Japan] | |
41 | +url = http://slashdot.jp/ | |
42 | +source = http://rss.rssad.jp/rss/slashdot/slashdot.rss | |
43 | +filters = slashdotjp | |
44 | + | |
45 | +[Engadget Japanese] | |
46 | +url = http://japanese.engadget.com/ | |
47 | +source = http://japanese.engadget.com/rss.xml | |
48 | +filters = tagging | |
49 | + | |
50 | +[ITmedia] | |
51 | +url = http://www.itmedia.co.jp/ | |
52 | +source = http://rss.rssad.jp/rss/itmtop/2.0/itmedia_all.xml | |
53 | +filters = tagging,itmedia | |
54 | + |
@@ -25,14 +25,17 @@ s.parentNode.insertBefore(ga, s); | ||
25 | 25 | <div class="container"> |
26 | 26 | |
27 | 27 | <!-- タイトル --> |
28 | - <div class="row"> | |
29 | - <div class="span12"> | |
30 | - <header id="site-header"> | |
28 | + <div class="row" id="site-header"> | |
29 | + <div class="span9"> | |
30 | + <header> | |
31 | 31 | <a href="${site.root}"> |
32 | 32 | <img id="sitelogo" src="${site.img_directory}/themesjp.png" alt="Themes.JP"> α |
33 | 33 | </a> |
34 | 34 | </header> |
35 | 35 | </div> |
36 | + <div class="span3 last-update"> | |
37 | + last update: ${date_format(site.last_update)} | |
38 | + </div> | |
36 | 39 | </div> |
37 | 40 | |
38 | 41 | <!-- コンテンツ本体 --> |
@@ -59,33 +62,32 @@ s.parentNode.insertBefore(ga, s); | ||
59 | 62 | <div class="entry"> |
60 | 63 | <!-- ヘッダ --> |
61 | 64 | <div class="entry-header"> |
65 | + % if 'images' in entry and len(entry.images) > 0: | |
66 | + <img class="thumbnail" src="${entry.images[0]}"> | |
67 | + % endif | |
62 | 68 | <h3> |
63 | - <a href='${entry.link}' target="_blank_">${entry.title}</a> | |
69 | + <a href='${entry.url}' target="_blank_">${entry.title}</a> | |
64 | 70 | </h3> |
65 | 71 | </div> |
66 | 72 | |
67 | 73 | |
68 | 74 | <!-- 本文テキスト --> |
69 | - <div class="entry-body">${entry.body}</div> | |
75 | + <div class="entry-body"> | |
76 | + ${entry.body} | |
77 | + </div> | |
70 | 78 | |
71 | 79 | <!-- フッタ --> |
72 | 80 | <div class="entry-footer"> |
73 | 81 | <div class="entry-continue"> |
74 | - <a href='${entry.link}'>[続きを読む]</a> | |
82 | + <a href='${entry.url}'>[続きを読む]</a> | |
75 | 83 | </div> |
76 | 84 | <div class="information"> |
77 | - <span>情報元:<a href='${entry.feed.source_url}'>${entry.feed.name}</a></span> | |
85 | + <span>情報元:<a href='${entry.feed.url}'>${entry.feed.name}</a></span> | |
78 | 86 | <span>(${date_format(entry.date)})</span> |
79 | 87 | <span>タグ:</span> |
80 | 88 | % for tag in entry.tags: |
81 | 89 | <span>${tag} </span> |
82 | 90 | % endfor |
83 | - % if 'images' in entry: | |
84 | - <span>画像:</span> | |
85 | - % for imgurl in entry.images: | |
86 | - <span><a href="${imgurl}">*</a></span> | |
87 | - % endfor | |
88 | - % endif | |
89 | 91 | </div> |
90 | 92 | </div> |
91 | 93 | </div> |
@@ -115,8 +117,8 @@ s.parentNode.insertBefore(ga, s); | ||
115 | 117 | <div class="feed-provider"> |
116 | 118 | <h3>情報提供サイト:</h3> |
117 | 119 | <ul class="nav nav-pills nav-stacked"> |
118 | - % for item in targets: | |
119 | - <li><a href="${item.source_url}">${item.name}</a></li> | |
120 | + % for item in sources: | |
121 | + <li><a href="${item.url}">${item.name}</a></li> | |
120 | 122 | % endfor |
121 | 123 | </ul> |
122 | 124 | </div> |