読者です 読者をやめる 読者になる 読者になる

はてなダイアリのURL一覧を取得してみる

はてなダイアリの記事一覧ページを取得し、lxmlを使って各日記のURLとタイトルをCSV形式で出力してみるテスト。何かAPIとかはないのか?

import urllib2, re, csv, sys, locale
from lxml import etree

def fetchentries(hatenaid, n):
    """Fetch one archive page of a Hatena Diary and return its entry links.

    Requests ``http://d.hatena.ne.jp/<hatenaid>/archive?word=&of=<n>``,
    parses the response as EUC-JP HTML with lxml, and collects every
    ``<a>`` element whose ``href`` begins with
    ``http://d.hatena.ne.jp/<hatenaid>/<digits>/<digits>``.

    Args:
        hatenaid: Hatena user id whose diary archive is read.
        n: Integer sent as the ``of`` query parameter (presumably the
            offset of the first entry on the page; the caller steps it
            by 50).

    Returns:
        dict mapping each matching entry URL to the anchor's ``text``
        (None when the anchor has no text before its first child).
        Empty when the page contains no matching links.
    """
    url = "http://d.hatena.ne.jp/%s/archive?word=&of=%d" % (hatenaid, n)

    # Build the pattern once per page.  The dots are escaped so they only
    # match literal dots, and the id is escaped so it is matched verbatim.
    entry_re = re.compile(
        r"http://d\.hatena\.ne\.jp/%s/\d+/\d+" % re.escape(hatenaid))

    parser = etree.HTMLParser(recover=True, encoding='euc-jp',
        remove_pis=True, remove_comments=True)

    page = urllib2.urlopen(url)
    try:
        tree = etree.parse(page, parser)
    finally:
        # Release the HTTP connection even if parsing fails.
        page.close()

    urls = {}
    for action, elem in etree.iterwalk(tree):
        if action != 'end' or elem.tag != 'a':
            continue
        # Anchors without an href (e.g. <a name="...">) are skipped
        # instead of raising KeyError.
        href = elem.get('href')
        if href is not None and entry_re.match(href):
            urls[href] = elem.text
    return urls

def fetchurls(hatenaid):
    """Collect the entry links from every archive page of a diary.

    Calls ``fetchentries`` with offsets 0, 50, 100, ... and merges each
    returned dict into one result, stopping at the first offset for
    which ``fetchentries`` returns an empty dict.

    Args:
        hatenaid: Hatena user id, passed through to ``fetchentries``.

    Returns:
        dict mapping entry URL to title, merged over all fetched pages.
    """
    collected = {}
    offset = 0
    batch = fetchentries(hatenaid, offset)
    while batch:
        collected.update(batch)
        offset += 50
        batch = fetchentries(hatenaid, offset)
    return collected

if __name__ == '__main__':
    # Command-line entry point: write "url,title" CSV rows for every entry
    # of the given Hatena id to stdout, sorted by URL.
    if len(sys.argv) != 2:
        sys.exit("Usage: gethatenad hatenaid")

    hatenaid = sys.argv[1]
    # fetchurls already returns a dict, so sort its items directly instead
    # of copying them into a second dict first.
    entries = sorted(fetchurls(hatenaid).items())

    out = csv.writer(sys.stdout)
    enc = locale.getpreferredencoding()
    for url, title in entries:
        # A title is None when the anchor had no leading text; emit an
        # empty field rather than crashing.  "replace" keeps the export
        # going when a character is not representable in the locale
        # encoding.
        out.writerow([url, (title or u"").encode(enc, "replace")])