# Python爬虫爬取豆瓣电影排名数据,直接复制!
import requests
import fake_useragent
from lxml import etree
if __name__ == '__main__':
    # Scrape the Douban Top 250 listing: ten pages are requested (start
    # offsets 0, 25, ..., 225) and, for every list item found, the first
    # title text and the first text node of the info paragraph are written
    # to ./douban250.txt, followed by a newline.
    #
    # The context manager guarantees the output file is closed even when a
    # request or parse step raises part-way through the crawl.
    with open('./douban250.txt', 'w', encoding='utf-8') as fp:
        print('开始爬取')
        # Build the generator once; only the `.random` pick changes per page.
        user_agents = fake_useragent.UserAgent()
        for i in range(10):
            start = 25 * i
            url = 'https://movie.douban.com/top250?start={}'.format(start)
            head = {
                'User-Agent': user_agents.random
            }
            # Without an explicit timeout requests may wait forever on a
            # stalled connection.
            response = requests.get(url, headers=head, timeout=10)
            response.encoding = 'utf-8'
            tree = etree.HTML(response.text)
            lis = tree.xpath('//*[@id="content"]/div/div[1]/ol/li')
            print(lis)
            for li in lis:
                names = li.xpath('.//a/span[1]/text()')
                if not names:
                    # No title node (unexpected markup); nothing useful to
                    # record for this item, so skip it instead of crashing.
                    continue
                # Presumably the director/cast line of the entry; verify
                # against the live page markup.
                infos = li.xpath('.//div/div[2]/div[2]/p[1]/text()[1]')
                fp.write(names[0])
                # Fall back to an empty string when the info paragraph is
                # missing so the title is still written.
                fp.write(infos[0] if infos else '')
                fp.write('\n')
        print('爬取结束')