抓大众点评才抓了几页就被屏蔽,找到如下方法解决。
第一步:获取代理ip
在http://www.xicidaili.com/nn获取代理,命名为proxy_ip.py,代码如下:
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import os.path
# Browser-like User-Agent so the scraper's requests are not rejected as a bot.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'
headers = {'User-Agent': user_agent}
def getListProxies():
    """Scrape up to 50 working HTTP proxies from xicidaili.com.

    Each candidate proxy is probed with a quick request (5 s timeout);
    proxies that fail the probe are skipped.  Returns a list of dicts of
    the form {'http': 'http://ip:port'}, ready for requests' ``proxies=``.
    """
    session = requests.session()
    page = session.get("http://www.xicidaili.com/nn", headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    proxyList = []
    # Proxy table rows alternate between class="odd" and class="" — match both.
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        # Column 1 is the IP, column 2 the port.
        proxy = {'http': 'http://' + tdlist[1].string + ':' + tdlist[2].string}
        url = "http://ip.chinaz.com/getip.aspx"  # probe URL to test the proxy
        try:
            session.get(url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 50:  # stop once we have enough proxies
                break
        # Fixed: `except Exception, e` is Python-2-only syntax (SyntaxError on
        # Python 3) and `e` was never used.  Deliberate best-effort skip.
        except Exception:
            continue
    return proxyList
if __name__ == "__main__":
    proxy_list = getListProxies()
    # BUG FIX: the original reopened proxy_ip.txt in "w" mode inside the
    # loop, truncating it on every iteration so only the LAST proxy was
    # ever saved.  Open the file once and write all entries.
    with open("proxy_ip.txt", "w") as fw:
        for proxy in proxy_list:
            fw.write(proxy["http"] + "\n")
运行后生成的 proxy_ip.txt 内容示例如下:
http://61.135.217.7:80
http://222.182.53.69:8118
http://116.249.222.96:8118
http://122.114.31.177:808
http://222.76.187.20:8118
http://115.46.151.140:8123
http://123.185.131.236:8118
http://112.114.95.43:8118
http://171.37.156.139:8123
http://115.55.158.113:8118
http://112.114.93.73:8118
http://113.221.46.141:8888
http://112.114.94.42:8118
http://180.115.12.214:28471
http://112.114.99.32:8118
第二步:利用代理ip抓取大众点评某个城市的所有美食商铺的评分
# coding:utf-8
import codecs
import json
import time
import re
import urllib2
import random
import requests
from collections import Counter
# Load the proxy pool produced by proxy_ip.py: one "http://host:port" per
# line, mapped into the {'http': ...} shape that requests' proxies= expects.
proxy_ip_list = []
with codecs.open("proxy_ip.txt", "r", "utf-8") as fr:
    proxy_ip_list = [{"http": raw.strip()} for raw in fr.readlines()]
def proxy_random():
    """Return one proxy dict picked uniformly at random from the pool.

    `global` is unnecessary for read-only access to a module-level name,
    and random.choice replaces the manual randint/index dance.  (On an
    empty pool this raises IndexError instead of randint's ValueError.)
    """
    return random.choice(proxy_ip_list)
def crawl_page_proxy(url, proxy):
    """Fetch one Dianping listing page through *proxy* and extract ratings.

    Returns the list of raw 'sml-rank-stars sml-strNN' class strings found
    in the page HTML (NN encodes the star rating, e.g. 45 -> 4.5 stars).
    Raises whatever requests raises when the proxy is dead or slow.
    """
    # BUG FIX: the original immediately overwrote the *url* argument with a
    # hard-coded page ("/search/category/35/10/p1"), so every call crawled
    # the same page no matter which URL the caller passed in.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    web_data = requests.get(url, headers=headers, proxies=proxy)
    # Each shop's rating appears in the markup as class="sml-rank-stars sml-strNN".
    res = re.findall(r'class\=\"sml\-rank\-stars sml\-str\d+\"', web_data.text)
    return res
def run(data_file):
    """Crawl Dianping star ratings for every city listed in *data_file*.

    *data_file* holds one JSON object per line, e.g.:
    {"city": "...", "url": "http://www.dianping.com/search/category/78/10/",
     "max_pages": 50, "min_pages": 1}

    For each city, pages min_pages..max_pages are fetched through randomly
    chosen proxies (up to 10 retries per page) and the raw star-class
    strings are written to "<city>.txt".
    """
    # BUG FIX: honour the data_file argument (was hard-coded to "data.txt").
    with codecs.open(data_file, "r", "utf-8") as fr:
        for line in fr.readlines():
            data_json = json.loads(line.strip())
            city = data_json["city"]
            main_url = data_json["url"]
            max_page = data_json["max_pages"]
            min_page = data_json["min_pages"]

            city_stars = []
            for page in range(min_page, max_page + 1):
                url = main_url + "p" + str(page)
                print("pages ==== ", city, url)
                # BUG FIX: stars_list was read after the retry loop even when
                # all 10 attempts failed, raising NameError; default to [].
                stars_list = []
                attempts = 0
                while attempts < 10:  # retry with a fresh random proxy
                    proxy_ip = proxy_random()
                    try:
                        stars_list = crawl_page_proxy(url, proxy_ip)
                        print(proxy_ip, "OK")
                        break
                    except Exception:  # dead proxy — try another one
                        attempts += 1
                        print(proxy_ip, "ERROR")
                print("\n")
                city_stars += stars_list
                time.sleep(random.uniform(3, 10))  # polite random delay

            # One output file per city: "<city>\t<list of star classes>".
            # (The original built a one-key dict and then shadowed `city`
            # while iterating it; a plain list writes the same content.)
            with codecs.open(city + ".txt", "w", "utf-8") as fw:
                fw.write(city + "\t" + str(city_stars) + "\n")
            time.sleep(30)  # long pause between cities to avoid the ban
if __name__ == "__main__":
    # Guard the entry point so importing this module does not start a crawl.
    run("data.txt")
完成,没有被屏蔽了。