#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
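"""Scrape chapters of 择天记 (Ze Tian Ji) from the featured ("good") threads of
its Baidu Tieba forum and write them to zetianji.txt."""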
import codecs
import urllib2
from bs4 import BeautifulSoup
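# Pages are parsed with BeautifulSoup's 'lxml' parser below; this assumes the
# lxml package is installed alongside beautifulsoup4.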
def get_html_content(url):
    """Download and return the raw HTML of the given URL."""
    return urllib2.urlopen(url).read()


def get_page_content(chapter_link):
    """Fetch a chapter thread and return the concatenated text of its posts."""
    page_content = get_html_content(chapter_link)
    bs = BeautifulSoup(page_content, 'lxml')
    # Post bodies on a Tieba thread page carry this class (trailing space included).
    posts = bs.find_all(attrs={'class': 'd_post_content j_d_post_content '})
    result = u''
    # Concatenate the text of every post after the first.
    for post in posts[1:]:
        result += post.get_text()
    return result


def main():
    # Listing of featured ("good") threads in the 择天记 forum; pn is the page offset.
    serial_url = 'http://tieba.baidu.com/f/good?kw=%E6%8B%A9%E5%A4%A9%E8%AE%B0&ie=utf-8&cid=5&pn='
    tieba_url = 'http://tieba.baidu.com/'
    only_see_lz = '?see_lz=1'  # show only the original poster's posts
    fout = codecs.open('zetianji.txt', 'w', 'utf-8')
    chapter_list = []
    # Walk the listing pages from offset 300 down to 0, reversing the threads on
    # each page before appending them to chapter_list.
    for pn in xrange(300, -1, -50):
        page_content = get_html_content(serial_url + str(pn))
        bs = BeautifulSoup(page_content, 'lxml')
        title_list = bs.find_all(attrs={'class': 'threadlist_text threadlist_title j_th_tit '})
        tmp = []
        for title in title_list:
            tmp.append([title.a['title'], title.a['href']])
        chapter_list += reversed(tmp)
    # Keep only threads whose title marks them as 择天记 chapters and dump each one.
    for chapter_name, chapter_href in chapter_list:
        if u'【择天记】' in chapter_name:
            print chapter_name
            print >> fout, get_page_content(tieba_url + chapter_href + only_see_lz)
    fout.close()


if __name__ == '__main__':
    main()