import requests
import lxml.etree as le
import pandas as pd
# Request headers: present a desktop-browser User-Agent to the server.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
}
url = "https://movie.douban.com/top250"

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the movie list.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
res = response.text
print(res)

# Parse the page and pull the first title <span> and the link out of every
# <div class="hd"> entry. The attribute value must be quoted with plain ASCII
# quotes; typographic quotes are not valid XPath string delimiters.
html_x = le.HTML(res)
titles = html_x.xpath("//div[@class='hd']/a/span[1]/text()")
links = html_x.xpath("//div[@class='hd']/a/@href")
def format_str(str):
    """Return the given text with every newline and space character removed.

    Args:
        str: The text to clean. The name shadows the builtin ``str`` but is
            kept unchanged so existing keyword callers keep working.

    Returns:
        A copy of the text without ``"\\n"`` and ``" "`` characters; other
        whitespace (tabs, carriage returns) is left untouched.
    """
    return str.replace("\n", "").replace(" ", "")
# Build every row first and construct the DataFrame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on each call.
columns = ["序号", "电影名称", "豆瓣链接"]
records = [
    {"序号": rank, "电影名称": title, "豆瓣链接": link}
    for rank, (title, link) in enumerate(zip(titles, links), start=1)
]
# Passing the columns explicitly keeps the header row even when nothing
# was scraped.
list_data = pd.DataFrame(records, columns=columns)

# iterrows() yields (index, Series) tuples; only the Series part is printed.
for _, row in list_data.iterrows():
    print(row)

print(f"已爬取总数目、为:{len(list_data)}")
# utf_8_sig prepends a BOM to the file (presumably so spreadsheet tools
# detect the UTF-8 encoding of the Chinese text).
list_data.to_csv("doubantop25.csv", index=False, encoding="utf_8_sig")