# XPath 爬取豆瓣电影 Top 250 排行榜并写入文件
from lxml import etree
import requests
import os
# Request headers handed to get_data(); carries a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def get_urls(total=250, page_size=25):
    """Build the list of Douban Top 250 page URLs.

    The URLs differ only in their ``start`` query parameter, which takes the
    values ``range(0, total, page_size)``. With the defaults that is
    0, 25, ..., 225, i.e. ten URLs.

    Args:
        total: Exclusive upper bound for the ``start`` values (presumably the
            number of ranked entries). Defaults to 250.
        page_size: Step between consecutive ``start`` values (presumably the
            number of entries shown per page). Defaults to 25.

    Returns:
        A list of URL strings in ascending ``start`` order; empty when
        ``total`` is not positive.

    Raises:
        ValueError: If ``page_size`` is 0 (raised by ``range``).
    """
    return [
        'https://movie.douban.com/top250?start={}&filter='.format(start)
        for start in range(0, total, page_size)
    ]
def get_data(url, headers, output_path='C:/Users/92590/Desktop/电影.txt', timeout=10):
    """Download one page, extract text via XPath, and append it to a file.

    Every extracted text node is stripped of surrounding whitespace, printed
    to stdout, and appended to ``output_path`` as its own line.

    Args:
        url: Address of the page to download.
        headers: Mapping of HTTP request headers to send with the request.
        output_path: File the extracted lines are appended to (UTF-8). It is
            opened in append mode, so repeated calls accumulate.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    # The second positional parameter of requests.get is ``params``; headers
    # must be passed by keyword, otherwise they are sent as query-string
    # arguments and the User-Agent never reaches the server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()

    html = etree.HTML(response.text)
    if html is None:
        # Nothing parseable came back (e.g. an empty body); nothing to write.
        return

    # Union of four node sets under the #content ordered list: span text
    # inside the item's link, the first and second text nodes of the <p>,
    # and span text inside the nested <div> (presumably titles, credits /
    # year line, and rating info -- verify against the live markup).
    datas = html.xpath(
        '//*[@id="content"]/div/div/ol/li/div/div/div/a/span/text()'
        '| //*[@id="content"]/div/div/ol/li/div/div/div/p/text()[1] '
        '| //*[@id="content"]/div/div/ol/li/div/div/div/p/text()[2] '
        '| //*[@id="content"]/div/div/ol/li/div/div/div/div/span/text()'
    )

    with open(output_path, 'a', encoding='utf-8') as f:
        for data in datas:
            line = data.strip()
            print(line)
            f.write(line)
            f.write("\n")
# Fetch every page URL in order; each get_data() call appends that page's
# extracted text to the output file. The loop variable is deliberately not
# named ``list`` so the builtin stays usable in the rest of the module.
for page_url in get_urls():
    get_data(page_url, headers)