# Static-page scraper: crawls Mtime (movie.mtime.com) and follows pagination.
'''
Created on 2015-9-28
'''
from lxml import html
from time import sleep
# --- Scraper configuration ---------------------------------------------------
# XPath for the link text of every list entry (the original author refers to
# these as "Male star" names).
names_xpath = "//strong[@class='px14']/a/text()"
# XPath for the introduction paragraph text of every list entry.
introductions_xpath = "//dd[@class='iinfo']/p[@class='mt6 c_666']/text()"
# XPath for the href of the "next page" pagination link.
next_button_xpath = "//a[@id='key_nextpage']/@href"

# Stop requesting further pages once at least this many names are collected.
min_names_wanted = 50
# Pause between two consecutive page requests, in seconds.
page_delay_seconds = 3

# Accumulated results across all fetched pages.
# NOTE(review): the two lists are filled by independent XPath queries; if an
# entry has no introduction paragraph (or yields several text nodes) the
# positions in the two lists stop corresponding -- confirm against the markup.
names = []
introductions = []

# The next-page href is substituted into this template.
# NOTE(review): this assumes the href is a bare file name relative to /list/;
# an absolute href would produce a malformed URL -- confirm against the site.
base_url = 'http://movie.mtime.com/list/{}'
next_page = "http://movie.mtime.com/list/250.html"

# --- Pagination loop ---------------------------------------------------------
# The threshold is checked before each fetch, so the final list may hold more
# than `min_names_wanted` names (a whole page is always appended).
while len(names) < min_names_wanted and next_page:
    print("Retrieved names from url: {}".format(next_page))
    dom = html.parse(next_page)
    names.extend(dom.xpath(names_xpath))
    introductions.extend(dom.xpath(introductions_xpath))

    next_pages = dom.xpath(next_button_xpath)
    if next_pages:
        next_page = base_url.format(next_pages[0])
    else:
        print("No next button found")
        next_page = None

    # Only wait when another request is actually going to follow; sleeping
    # after the last page would just delay the rest of the script.
    if next_page and len(names) < min_names_wanted:
        sleep(page_delay_seconds)
# Persist the scraped data as UTF-8 bytes, one record per line: the name
# immediately followed by its introduction (no separator), then a newline.
# zip() stops at the shorter of the two lists, so only complete
# name/introduction pairs are written.
with open('information.txt', 'wb') as out:
    for name, introduction in zip(names, introductions):
        out.write(name.encode('utf-8'))
        out.write(introduction.encode('utf-8'))
        out.write('\n'.encode('utf-8'))
# Reported after the `with` block, i.e. once the file has been closed.
print("WRITE DONE")
# Read the freshly written file back and echo it to the console.
with open('information.txt') as saved:
    # Iterating the handle yields the same lines (newline included) as
    # readlines() would.
    informations = list(saved)

# The figure reported is the number of lines in the file.
# NOTE(review): an introduction containing a line break would span several
# lines and inflate this count -- confirm whether that can happen.
print("Well, we got {} Male Star!".format(len(informations)))

# Every line still carries its trailing newline, so print emits a blank line
# after each record.
for information in informations:
    print(information)