Fetching a list of Hatena Diary entry URLs
A quick test that fetches the Hatena Diary archive pages and uses lxml to dump each entry's URL and title in CSV format. Isn't there an API or something for this?
import urllib2, re, csv, sys, locale
from lxml import etree

def fetchentries(hatenaid, n):
    # Fetch one page of the archive listing; "of" is the offset of the
    # first entry on the page (the archive shows 50 entries per page).
    url = "http://d.hatena.ne.jp/%s/archive?word=&of=%d" % (hatenaid, n)
    page = urllib2.urlopen(url)
    # Hatena Diary pages are served as EUC-JP; parse leniently.
    parser = etree.HTMLParser(recover=True, encoding='euc-jp',
                              remove_pis=True, remove_comments=True)
    tree = etree.parse(page, parser)
    # Collect anchors whose href looks like an entry permalink,
    # i.e. http://d.hatena.ne.jp/<id>/<date>/<number>.
    pattern = re.compile(r"http://d\.hatena\.ne\.jp/%s/\d+/\d+"
                         % re.escape(hatenaid))
    urls = {}
    for action, elem in etree.iterwalk(tree):
        if action == 'end' and elem.tag == 'a':
            href = elem.get('href', '')  # not every <a> has an href
            if pattern.match(href):
                urls[href] = elem.text or u''  # guard against empty anchors
    return urls

def fetchurls(hatenaid):
    # Walk the archive 50 entries at a time until an empty page comes back.
    urls = {}
    n = 0
    while True:
        ret = fetchentries(hatenaid, n)
        if not ret:
            break
        urls.update(ret)
        n += 50
    return urls

if __name__ == '__main__':
    if len(sys.argv) != 2:
        sys.exit("Usage: gethatenad hatenaid")
    hatenaid = sys.argv[1]
    writer = csv.writer(sys.stdout)
    enc = locale.getpreferredencoding()
    # Sort by URL (which sorts chronologically, since the path embeds the
    # date) and encode titles for the local terminal or spreadsheet.
    for url, title in sorted(fetchurls(hatenaid).items()):
        writer.writerow([url, title.encode(enc)])
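A minimal usage sketch, assuming the script is saved as gethatenad.py and that sample_id stands in for a real Hatena ID (both names are placeholders, not from the original):

$ python gethatenad.py sample_id > entries.csv

Each CSV row is an entry permalink followed by its title; the title is encoded with the locale's preferred encoding so the file opens cleanly in a local editor or spreadsheet.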