# 尝试使用了一次多进程,但是显示效果其实不是很好,不想用就去掉吧
#!/usr/bin/env python
#-*- coding:utf-8 -*-
# file:豆瓣电影.py
# author:ytytyt
# datetime:2021/7/14 16:00
# software: PyCharm
'''
分析:
url:https://movie.douban.com/top250?start=0
每25部电影一页 start+25即翻页
数据位置:ol li(class grid_view)里面div(item),里面有div(pic)存放排名(em),超链接里面有图片
'''
# import module your need
import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
import json,time
import multiprocessing
def getpage(url):
    """Fetch one Top-250 listing page and return its HTML text.

    Returns the decoded page body on HTTP 200, or None on any non-200
    status or request failure (the error is printed, not raised).

    A browser-like User-Agent is sent because douban.com rejects the
    default requests UA.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    try:
        # timeout added: without it a stalled connection blocks the
        # worker process indefinitely
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            # force utf-8 so Chinese titles decode correctly
            res.encoding = "utf-8"
            return res.text
        else:
            return None
    except RequestException as e:
        print(e)
        return None
def parsePage(html):
    """Parse one listing page, yielding one dict per movie.

    Each yielded dict has keys:
      index -- rank inside div.pic > em
      title -- alt text of the poster img
      image -- poster img URL (src attribute)
      info  -- director/cast/year paragraph, with &nbsp; normalized to spaces
    """
    doc = pq(html)
    # every movie card on the page sits in a div with class "item";
    # .items() already yields PyQuery objects, so the original's extra
    # pq(data) re-wrap was redundant and is removed
    for item in doc(".item").items():
        yield {
            "index": item('.pic em').text(),
            "title": item('.pic a img').attr.alt,
            "image": item('.pic a img').attr.src,
            "info": item('.info .bd p:first').text().replace(u'\xa0', u' ')
        }
def writeFile(content):
    """Append one movie record to the output file as a JSON line."""
    line = json.dumps(content, ensure_ascii=False)
    # append mode so successive calls (and both worker processes) accumulate
    with open("豆瓣top250.txt", "a", encoding='utf-8') as out:
        out.write(line + "\n")
def main(start, Lock):
    """Crawl 5 consecutive listing pages starting at offset `start`.

    Each page holds 25 movies, so this worker covers offsets
    start, start+25, ..., start+100. Parsed records are appended to the
    output file; `Lock` serializes file writes across worker processes.
    """
    for i in range(5):
        offset = start + 25 * i
        url = "https://movie.douban.com/top250?start=" + str(offset)
        # // (floor division) keeps the page number an int; the original's
        # single / printed e.g. "第1.0页" instead of "第1页"
        print("正在爬取第{}页...".format(offset // 25 + 1))
        html = getpage(url)
        if html:
            for item in parsePage(html=html):
                # only one process may write the shared file at a time
                with Lock:
                    writeFile(item)
if __name__ == '__main__':
    # Two worker processes split the 10 pages (offsets 0-100 and 125-225);
    # they share one lock so file writes do not interleave.
    mutex = multiprocessing.Lock()
    workers = [
        multiprocessing.Process(target=main, args=(0, mutex)),
        multiprocessing.Process(target=main, args=(125, mutex)),
    ]
    for w in workers:
        w.start()
    # join() so the completion message prints only after the crawl actually
    # finishes; the original printed it immediately after starting the workers
    for w in workers:
        w.join()
    print("爬取完成")