爬取大众点评的评论
介绍
这个项目用于爬取大众点评中“武汉轮渡武汉关码头”的评论,获取评论用户的性别和评论时间。代码只能作为参考,因为我的账号已经被封禁了(具体原因可能是我没有设置请求延迟,下面的代码中也没有设置;如果你不会设置,请不要使用)。Cookie 需要自己填写,代理可以使用免费的代理。输出格式是 CSV 文件,可以在 Excel 中另存为 xlsx 文件。
代码
# -*-coding:utf8 -*-
import requests,re,time,csv
# Pattern for one review entry in a shop's review-list HTML.
# Named groups:
#   href  - the path segment that follows "/member/" in the reviewer's link
#   name  - inner text of the anchor carrying data-click-title="文字"
#   time1 - inner text of the <span class="time"> inside the misc-info div
# The pieces below are adjacent raw literals, so they compile as one pattern.
# DOTALL lets the lazy ".*?" gaps run across line breaks in the markup.
re1 = re.compile(
    r'href="/member/(?P<href>.*?)"'
    r'.*?data-click-name=.*?data-click-title=.*?>.*?</a>'
    r'.*?data-click-title="文字".*?>(?P<name>.*?)</a>'
    r'.*?<div class="misc-info clearfix">'
    r'.*?<span class="time">(?P<time1>.*?)</span>',
    flags=re.DOTALL,
)

# Pattern for the "user-groun" span in a member page's HTML. The named group
# `sex` captures the class attribute of the <i> tag inside that span
# (presumably a gender marker -- confirm against the live markup).
re2 = re.compile(
    r'<span class="user-groun">'
    r'<i class="(?P<sex>.*?)"></i>'
    r'.*?</span>',
    flags=re.DOTALL,
)
# Proxy mapping handed to every requests.get() call; the key is the URL
# scheme whose traffic should be routed through the given proxy address.
proxies = {
    "https": "117.69.233.236:8089",
}

# HTTP headers sent with every request.
headers = {
    # Left empty on purpose: paste your own logged-in cookie string here.
    "Cookie": "",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/104.0.0.0 Safari/537.36"
    ),
    "Referer": "https://www.dianping.com/shop/l9AxHePS7bRtisgd/review_all/p3",
    "Host": "www.dianping.com",
}
# Crawl the shop's review-list pages and, for every review found, fetch the
# reviewer's member page; each match of re2 on that page produces one CSV row
# of [href, name, time1, sex] in daz.csv.

# Seconds to pause after each HTTP request. 0 means no throttling; set a
# positive value to slow the crawl down (the introduction above suspects the
# missing delay is what got the author's account banned).
REQUEST_DELAY_SECONDS = 0

# Upper bound in seconds for connecting / waiting on data, so that a dead
# proxy raises requests.exceptions.Timeout instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 10

# newline="" is what the csv module asks for; without it Windows output gets
# an empty line after every row. The with-block guarantees the file is
# flushed and closed even if a request raises midway.
with open("daz.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    # NOTE(review): page numbers run 0..109 as before -- confirm whether the
    # site's first review page is p0 or p1.
    for page in range(110):
        url = f"https://www.dianping.com/shop/l9AxHePS7bRtisgd/review_all/p{page}"
        # The response is used as a context manager so the connection is
        # released even when reading the body fails.
        with requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=REQUEST_TIMEOUT_SECONDS,
        ) as page_response:
            page_html = page_response.text
        if REQUEST_DELAY_SECONDS > 0:
            time.sleep(REQUEST_DELAY_SECONDS)

        for review in re1.finditer(page_html):
            href = review.group("href")
            name = review.group("name").strip()
            time1 = review.group("time1").strip()

            url2 = f"https://www.dianping.com/member/{href}"
            with requests.get(
                url2,
                headers=headers,
                proxies=proxies,
                timeout=REQUEST_TIMEOUT_SECONDS,
            ) as member_response:
                member_html = member_response.text
            if REQUEST_DELAY_SECONDS > 0:
                time.sleep(REQUEST_DELAY_SECONDS)

            # A member page with no re2 match yields no row at all.
            for sex_match in re2.finditer(member_html):
                sex = sex_match.group("sex")
                csvwriter.writerow([href, name, time1, sex])
                print(name)