# Crawl detail pages (爬取详情页)
import gevent
from gevent import monkey
monkey.patch_all()
import requests
import redis
from queue import Queue
import json
def request_detail(poiId_queue, comment_queue):
    """Spawn one gevent greenlet per 300-comment page for every merchant.

    poiId_queue   -- list of merchant poiIds
    comment_queue -- list of total comment counts, parallel to poiId_queue

    Each greenlet runs ``func`` to fetch and store one page of comments;
    this call blocks until all greenlets finish.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    comment_url = "https://www.meituan.com/meishi/api/poi/getMerchantComment?id=%d&offset=%d&pageSize=%d"
    g_list = []
    for poi_id, total in zip(poiId_queue, comment_queue):
        # Walk the comment list in pages of at most 300.  The original code
        # computed totalPage/remainder separately and, when totalPage was 0,
        # either referenced an undefined loop variable or skipped merchants
        # with fewer than 300 comments entirely; range(0, total, 300) covers
        # every case uniformly.
        for offset in range(0, total, 300):
            page_size = min(300, total - offset)
            page_url = comment_url % (poi_id, offset, page_size)
            print("正在请求:", page_url)
            g_list.append(gevent.spawn(func, page_url, headers, poi_id))
    gevent.joinall(g_list)
def func(url, headers, poiId):
    """Fetch one page of merchant comments and push each one into Redis.

    url     -- fully-formed getMerchantComment API URL
    headers -- request headers (User-Agent)
    poiId   -- merchant id, stored alongside each comment

    Relies on the module-level ``rds`` Redis connection created in __main__.
    """
    res = requests.get(url=url, headers=headers)
    try:
        # Missing/odd JSON, a missing "data"/"comments" key, or a null value
        # all mean the merchant has no usable comments.  The original caught
        # bare Exception, which also hid genuine Redis/runtime failures.
        c_list = json.loads(res.text)["data"]["comments"]
        for c in c_list:
            item = {
                "poiId": poiId,
                "user": c["userName"],
                "comment": c["comment"],
            }
            print(item)
            print("%s已经存入redis数据库!" % item["user"])
            rds.lpush("foodComments", json.dumps(item))
    except (ValueError, KeyError, TypeError):
        print("该商家暂无评论信息!")
if __name__ == '__main__':
    # Connect to the Redis instance holding the merchant list.  ``rds`` must
    # stay module-level: func() uses it to store comments.
    rds = redis.StrictRedis(host="*******", port=6379, db=8)
    list_len = rds.llen("foodlist")
    ids = []
    counts = []
    # Rebuild parallel lists of merchant ids and their comment totals from
    # the JSON records stored by the list-page crawler.
    for raw in rds.lrange("foodlist", 0, list_len):
        record = json.loads(raw)
        ids.append(record["poiId"])
        counts.append(record["comments"])
    request_detail(poiId_queue=ids, comment_queue=counts)
# Crawl the main listing pages (爬取主页面)
import requests
import re
import threading
from time import sleep
import redis
from queue import Queue
import json
class CrawlThread(threading.Thread):
    """Worker thread that fetches Meituan list pages and queues the raw HTML.

    start_url  -- base listing URL; "pn<page>/" is appended per request
    page_queue -- queue of page numbers still to fetch (shared between crawlers)
    data_queue -- queue receiving each response body for the parser threads
    name       -- display name used in progress messages
    """

    def __init__(self, start_url, page_queue, data_queue, name):
        super().__init__()
        self.start_url = start_url
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.name = name

    def run(self):
        from queue import Empty  # file-level import only brings in Queue
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
        while True:
            try:
                # get_nowait() fixes the empty()/get() race of the original:
                # with several crawler threads the queue could drain between
                # the two calls, leaving this thread blocked forever on get().
                page = self.page_queue.get_nowait()
            except Empty:
                break
            url = self.start_url + "pn" + str(page) + "/"
            print("当前线程为:%s,正在请求页面:%s" % (self.name, url))
            res = requests.get(url=url, headers=headers)
            self.data_queue.put(res.text)
            print("当前已经入队了%d个页面" % self.data_queue.qsize())
            sleep(1)  # throttle so we don't hammer the site
class ParseThread(threading.Thread):
    """Worker thread that parses queued HTML pages and stores restaurants in Redis.

    data_queue -- queue of raw HTML produced by the crawler threads
    name       -- display name used in progress messages
    page_queue -- the crawlers' page queue; when both queues are empty, work is done
    """

    def __init__(self, data_queue, name, page_queue):
        super().__init__()
        self.data_queue = data_queue
        self.name = name
        # Bug fix: page_queue was accepted but never stored, so run()
        # silently depended on a module-level global of the same name.
        self.page_queue = page_queue

    def run(self):
        from queue import Empty  # file-level import only brings in Queue
        while True:
            try:
                # Short blocking get instead of the original busy-spin
                # (`if empty(): continue`) that burned CPU while waiting.
                html = self.data_queue.get(timeout=1)
            except Empty:
                # Nothing parsed and nothing left to crawl -> we are done.
                if self.page_queue.empty():
                    break
                continue
            self.parse(html)

    def parse(self, html):
        """Extract the "poiLists" JSON blob from *html* and push each restaurant to Redis."""
        pat = re.compile(r'"poiLists":(.+),"comHeader"')
        s = pat.findall(html)[0]  # IndexError here means the page layout changed
        jsondict = json.loads(s)
        foodlist = jsondict["poiInfos"]
        # One connection per page instead of the original one-per-restaurant.
        # NOTE(review): host/db (db=13) differ from the detail crawler's
        # connection (db=8) even though both use key "foodlist" — confirm.
        rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=13)
        for food in foodlist:
            item = {
                "poiId": food["poiId"],
                "title": food["title"],
                "avgScore": food["avgScore"],
                "avgPrice": food["avgPrice"],
                "address": food["address"],
            }
            print("当前线程为:%s,正在向redis数据库中存入数据:%s" % (self.name, item['title']))
            rds.lpush("foodlist", json.dumps(item))
if __name__ == '__main__':
    # ``start_url`` and ``page_queue`` stay module-level: ParseThread.run
    # reads the global page_queue in the original code.
    start_url = "https://bj.meituan.com/meishi/"
    page_queue = Queue()
    for page in range(1, 68):
        page_queue.put(page)
    data_queue = Queue()
    # Three crawler threads pull page numbers and enqueue raw HTML.
    for crawler_name in ("爬虫1", "爬虫2", "爬虫3"):
        CrawlThread(start_url=start_url, page_queue=page_queue,
                    data_queue=data_queue, name=crawler_name).start()
    # Three parser threads drain the HTML queue into Redis.
    for parser_name in ("解析1", "解析2", "解析3"):
        ParseThread(data_queue=data_queue, name=parser_name,
                    page_queue=page_queue).start()