import csv
import requests
import re
import json
# Page that is fetched and scraped.
url = "https://top.baidu.com/board?tab=movie"
headers = {
    # NOTE(review): this value is a placeholder (it reads "your own UA");
    # presumably it must be replaced with a real browser User-Agent string
    # before the request can succeed -- confirm before running.
    'User-Agent': '自己的UA'
}

# Seconds to wait for the server before giving up, so the script cannot
# hang forever on a stalled connection.
REQUEST_TIMEOUT = 10

# CSV column names, in the same order as the regex capture groups below.
FIELDNAMES = ['moviename', 'moviestyle', 'personname']

# Captures three groups per match: the text of a "c-single-text-ellipsis"
# div, then the text after "类型:" and after "演员:" in the two
# "intro_1l0wp" divs that follow it. re.S lets ".*?" span line breaks.
MOVIE_PATTERN = re.compile(
    '<div class="c-single-text-ellipsis"> (.*?)</div>.*?<div class="intro_1l0wp"> 类型:(.*?) </div><div class="intro_1l0wp"> 演员:(.*?) </div>',
    re.S)


def fetch_html(page_url=url, request_headers=None, timeout=REQUEST_TIMEOUT):
    """Download *page_url* and return its body decoded as UTF-8.

    Args:
        page_url: Address to fetch; defaults to the module-level ``url``.
        request_headers: HTTP headers to send; the module-level ``headers``
            are used when this is ``None``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if request_headers is None:
        request_headers = headers
    response = requests.get(page_url, headers=request_headers, timeout=timeout)
    # Fail loudly instead of parsing an error page and writing an empty CSV.
    response.raise_for_status()
    return response.content.decode("utf-8")


def find_movie_rows(html):
    """Return every ``(name, style, actors)`` tuple matched in *html*.

    An empty list is returned when nothing in *html* matches.
    """
    return MOVIE_PATTERN.findall(html)


def build_records(rows):
    """Turn matched tuples into dicts keyed by ``FIELDNAMES``."""
    return [dict(zip(FIELDNAMES, row)) for row in rows]


def write_csv(records, path="movie.csv"):
    """Write *records* to *path* as UTF-8 CSV with a header row."""
    # newline="" lets the csv module control line endings itself.
    with open(path, "w", encoding="utf-8", newline="") as f:
        wt = csv.DictWriter(f, FIELDNAMES)
        # Write the header row first, then one row per record.
        wt.writeheader()
        wt.writerows(records)


def main():
    """Fetch the page, extract the movie fields and save them to movie.csv."""
    html = fetch_html()
    result = find_movie_rows(html)
    print(result)
    data_list = build_records(result)
    print(data_list)
    write_csv(data_list)


if __name__ == "__main__":
    main()
# 爬取 https://top.baidu.com/board?tab=movie 电影信息
# 最新推荐文章于 2024-03-20 13:11:09 发布
