抓大众点评才抓了几页就被屏蔽,找到如下方法解决。
第一步:获取代理ip
在http://www.xicidaili.com/nn获取代理,命名为proxy_ip.py,代码如下:
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import os.path
# Browser-like User-Agent so the scraper's requests are not rejected as a bot.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'
headers = {'User-Agent': user_agent}
def getListProxies():
    """Scrape up to 50 working HTTP proxies from xicidaili.com.

    Each candidate proxy is probed with a quick request (5 s timeout);
    proxies that fail the probe are skipped.  Returns a list of dicts of
    the form {'http': 'http://ip:port'}, ready for requests' ``proxies=``.
    """
    session = requests.session()
    page = session.get("http://www.xicidaili.com/nn", headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    proxyList = []
    # Proxy table rows alternate between class="odd" and class="" — match both.
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        # Column 1 is the IP, column 2 the port.
        proxy = {'http': 'http://' + tdlist[1].string + ':' + tdlist[2].string}
        url = "http://ip.chinaz.com/getip.aspx"  # probe URL to test the proxy
        try:
            session.get(url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 50:  # stop once we have enough proxies
                break
        # Fixed: `except Exception, e` is Python-2-only syntax (SyntaxError on
        # Python 3) and `e` was never used.  Deliberate best-effort skip.
        except Exception:
            continue
    return proxyList
if __name__ == "__main__":
    proxy_list = getListProxies()
    # BUG FIX: the original reopened proxy_ip.txt in "w" mode inside the
    # loop, truncating it on every iteration so only the LAST proxy was
    # ever saved.  Open the file once and write all entries.
    with open("proxy_ip.txt", "w") as fw:
        for proxy in proxy_list:
            fw.write(proxy["http"] + "\n")
运行后生成的 proxy_ip.txt 内容示例如下:
http://61.135.217.7:80
http://222.182.53.69:8118
http://116.249.222.96:8118
http://122.114.31.177:808
http://222.76.187.20:8118
http://115.46.151.140:8123
http://123.185.131.236:8118
http://112.114.95.43:8118
http://171.37.156.139:8123
http://115.55.158.113:8118
http://112.114.93.73:8118
http://113.221.46.141:8888
http://112.114.94.42:8118
http://180.115.12.214:28471
http://112.114.99.32:8118
第二步:利用代理ip抓取大众点评某个城市的所有美食商铺的评分
# coding:utf-8
import codecs
import json
import time
import re
import urllib2
import random
import requests
from collections import Counter
# Load the proxy pool produced by proxy_ip.py: one "http://host:port" per
# line, mapped into the {'http': ...} shape that requests' proxies= expects.
proxy_ip_list = []
with codecs.open("proxy_ip.txt", "r", "utf-8") as fr:
    proxy_ip_list = [{"http": raw.strip()} for raw in fr.readlines()]
def proxy_random():
    """Return one proxy dict picked uniformly at random from the pool.

    `global` is unnecessary for read-only access to a module-level name,
    and random.choice replaces the manual randint/index dance.  (On an
    empty pool this raises IndexError instead of randint's ValueError.)
    """
    return random.choice(proxy_ip_list)
def crawl_page_proxy(url, proxy):
    """Fetch one Dianping listing page through *proxy* and extract ratings.

    Returns the list of raw 'sml-rank-stars sml-strNN' class strings found
    in the page HTML (NN encodes the star rating, e.g. 45 -> 4.5 stars).
    Raises whatever requests raises when the proxy is dead or slow.
    """
    # BUG FIX: the original immediately overwrote the *url* argument with a
    # hard-coded page ("/search/category/35/10/p1"), so every call crawled
    # the same page no matter which URL the caller passed in.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    web_data = requests.get(url, headers=headers, proxies=proxy)
    # Each shop's rating appears in the markup as class="sml-rank-stars sml-strNN".
    res = re.findall(r'class\=\"sml\-rank\-stars sml\-str\d+\"', web_data.text)
    return res
def run(data_file):
    """Crawl Dianping star ratings for every city listed in *data_file*.

    *data_file* holds one JSON object per line, e.g.:
    {"city": "...", "url": "http://www.dianping.com/search/category/78/10/",
     "max_pages": 50, "min_pages": 1}

    For each city, pages min_pages..max_pages are fetched through randomly
    chosen proxies (up to 10 retries per page) and the raw star-class
    strings are written to "<city>.txt".
    """
    # BUG FIX: honour the data_file argument (was hard-coded to "data.txt").
    with codecs.open(data_file, "r", "utf-8") as fr:
        for line in fr.readlines():
            data_json = json.loads(line.strip())
            city = data_json["city"]
            main_url = data_json["url"]
            max_page = data_json["max_pages"]
            min_page = data_json["min_pages"]

            city_stars = []
            for page in range(min_page, max_page + 1):
                url = main_url + "p" + str(page)
                print("pages ==== ", city, url)
                # BUG FIX: stars_list was read after the retry loop even when
                # all 10 attempts failed, raising NameError; default to [].
                stars_list = []
                attempts = 0
                while attempts < 10:  # retry with a fresh random proxy
                    proxy_ip = proxy_random()
                    try:
                        stars_list = crawl_page_proxy(url, proxy_ip)
                        print(proxy_ip, "OK")
                        break
                    except Exception:  # dead proxy — try another one
                        attempts += 1
                        print(proxy_ip, "ERROR")
                print("\n")
                city_stars += stars_list
                time.sleep(random.uniform(3, 10))  # polite random delay

            # One output file per city: "<city>\t<list of star classes>".
            # (The original built a one-key dict and then shadowed `city`
            # while iterating it; a plain list writes the same content.)
            with codecs.open(city + ".txt", "w", "utf-8") as fw:
                fw.write(city + "\t" + str(city_stars) + "\n")
            time.sleep(30)  # long pause between cities to avoid the ban
if __name__ == "__main__":
    # Guard the entry point so importing this module does not start a crawl.
    run("data.txt")
完成,没有被屏蔽了。