用Python从贴吧扒连载小说


#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import codecs
import urllib2
import re
from bs4 import BeautifulSoup

def get_html_content(url=None):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request; handed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``read()`` yields for the response (the undecoded body).

    Raises:
        Any exception raised by ``urllib2.urlopen`` or ``read()`` propagates
        to the caller unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # urllib2 responses are not context managers in Python 2, so close
        # explicitly to release the connection even when read() fails.
        response.close()

def get_page_content(chapter_link=None):
    """Download a thread page and return the text of its post bodies.

    Args:
        chapter_link: URL of the thread page, passed to ``get_html_content``.

    Returns:
        A unicode string made by concatenating ``get_text()`` of every
        matching post element except the first; ``u''`` when there are
        fewer than two matches.
    """
    html = get_html_content(chapter_link)
    # Name the parser explicitly (the same one main() uses) so the result
    # does not depend on which parsers happen to be installed.
    soup = BeautifulSoup(html, 'lxml')
    posts = soup.find_all(attrs={'class': 'd_post_content j_d_post_content '})
    # The first match is deliberately skipped, as in the original code.
    # NOTE(review): presumably it is not chapter text -- confirm.
    return u''.join(post.get_text() for post in posts[1:])

def main():
    serial_url = 'http://tieba.baidu.com/f/good?kw=%E6%8B%A9%E5%A4%A9%E8%AE%B0&ie=utf-8&cid=5&pn='
    tieba_url = 'http://tieba.baidu.com/'
    only_see_lz = '?see_lz=1'

    fout = codecs.open('zetianji.txt', 'w', 'utf-8')
    chapter_list = []
    for pn in xrange(300, -1, -50):
        page_content = get_html_content(serial_url + str(pn))
        bs = BeautifulSoup(page_content, 'lxml')
        title_list = bs.find_all(attrs = {'class' : 'threadlist_text threadlist_title j_th_tit  '})
        tmp = []
        for title in title_list:
            tmp.append([title.a['title'], title.a['href']])

        chapter_list += reversed(tmp)

    for chapter_name, chapter_num in chapter_list:
        if u'【择天记】' in chapter_name:
            print chapter_name
            print >> fout, get_page_content(tieba_url + chapter_num + only_see_lz)

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()