import requests
from lxml import etree
import re
from bs4 import BeautifulSoup
import os
import time
import json
# Parse the data
def parse(res):
    """Extract the movie entries from the HTML text of one page.

    Args:
        res: HTML text to parse.

    Returns:
        A list with one dict per ``<div class="item">`` element found in the
        document. Each dict has the keys ``index``, ``image``, ``title``,
        ``actor`` and ``score``; every value is the raw list returned by the
        matching XPath query, so it may be empty or hold several strings.
    """
    # Output key -> XPath evaluated relative to each item node. The order
    # here is the key order of every returned dict.
    field_queries = (
        ('index', './/div/em[@class=""]/text()'),
        ('image', './/img[@width="100"]/@src'),
        ('title', './/span[@class="title"]/text()'),
        ('actor', './/p[@class=""]/text()'),
        ('score', './/span[@class="rating_num"]/text()'),
    )
    document = etree.HTML(res)
    return [
        {field: node.xpath(query) for field, query in field_queries}
        for node in document.xpath('//div[@class="item"]')
    ]
# Write the data
def wtiteFile(item):
    """Append ``item`` to ``./douban.json`` as one line of JSON.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        item: JSON-serialisable object to store.
    """
    with open('./douban.json', mode='a', encoding='utf-8') as out:
        out.write(json.dumps(item, ensure_ascii=False) + '\n')
def main(offset, timeout=10):
    """Fetch one page of the Douban Top 250 list and store its entries.

    Downloads the page, passes the HTML to ``parse`` and hands every
    returned entry to ``wtiteFile``. A response whose status code is not
    200 is reported on stdout and otherwise skipped.

    Args:
        offset: value appended to the URL as the ``start`` query parameter.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    url = 'https://movie.douban.com/top250?start='+str(offset)
    headers = {
        'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; WOW64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 72.0.3626.81 Safari / 537.36 SE 2.X MetaSr 1.0'
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    res = requests.get(url, headers=headers, timeout=timeout)
    if res.status_code != 200:
        # Make the skipped page visible instead of dropping it silently.
        print('请求失败', res.status_code, url)
        return
    for item in parse(res.text):
        print('正在写入数据',item["title"])
        wtiteFile(item)
if __name__ == '__main__':
    # Ten pages, stepping the start offset by 25: 0, 25, ..., 225.
    for page_start in range(0, 250, 25):
        main(offset=page_start)
        # Pause for two seconds after each page.
        time.sleep(2)
# Python: scraping Douban movies
# (Web-page residue: "latest recommended article published on 2024-07-24 23:24:10")