Crawling Qyer.com Through Proxies

1. Getting proxies from a proxy pool

proxiespool.py

# A proxy pool has four jobs: 1. store the proxies (local file or database)  2. refresh them periodically  3. clean the pool periodically  4. check proxy quality
import requests
import redis
import json
import re

# Fetch a batch of proxies from the vendor API
def get_proxies(url,headers):
    res = requests.get(url=url,headers=headers)
    print(res.text)
    return json.loads(res.text)
# Store a proxy in Redis
def write_proxies_to_redis(proxy):
    rds = redis.StrictRedis(host="127.0.0.1",port=6379,db=3)
    # redis only stores bytes/strings; get_pool() parses this str() form back out with a regex
    rds.lpush("proxies",str(proxy))

# Quality check: only proxies that answer a test request get stored
def test_proxies(proxies,headers):
    for proxy in proxies["data"]:
        try:
            # the vendor API is assumed to return each proxy's "host:port" under the 'IP' key (the same key get_pool's regex expects)
            requests.get(url="https://www.baidu.com/",headers=headers,proxies={"https":proxy["IP"]},timeout=3)
            write_proxies_to_redis(proxy)
        except Exception as e:
            print("Proxy '%s' failed the quality check!"%proxy)
            print(e)

# Clean the pool: re-test every stored proxy and drop the ones that no longer work
def clean_proxies(headers,url):
    rds = redis.StrictRedis(host="127.0.0.1",port=6379,db=3)
    count = rds.llen("proxies")
    proxies = rds.lrange("proxies",0,count)
    for proxy in proxies:
        # lrange returns the str() form of each stored dict as bytes,
        # so recover the "host:port" string before re-testing the proxy
        ip = re.findall(pattern=r"'IP': '(.+?)'",string=proxy.decode('utf-8'))[0]
        try:
            requests.get(url="https://www.baidu.com/",headers=headers,proxies={"https":ip},timeout=3)
        except Exception as e:
            print("Proxy '%s' failed the quality check!"%proxy)
            # redis-py 3.x signature is lrem(name, count, value); count=0 removes every matching element
            rds.lrem("proxies",0,proxy)
            update_proxies(url=url,headers=headers)

# Periodic refill: top the pool up whenever it runs low
def update_proxies(url,headers):
    rds = redis.StrictRedis(host="127.0.0.1",port=6379,db=3)
    if rds.llen("proxies") <= 10:
        proxy_dict = get_proxies(url=url,headers=headers)
        test_proxies(proxies=proxy_dict,headers=headers)

# Public interface: pull the stored proxies out of Redis in requests' proxies format
def get_pool():
    rds = redis.StrictRedis(host="127.0.0.1",port=6379,db=3)
    count = rds.llen("proxies")
    proxies = rds.lrange("proxies", 0, count)
    # print(proxies)
    ippool = []
    for proxy in proxies:
        # print(proxy)
        ip = re.findall(pattern=r"'IP': '(.+?)'",string=proxy.decode('utf-8'))[0]
        ippool.append({"https":ip})
    return ippool



if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    url = "http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=json&ss=5&css=&ipport=1&et=1&pi=1&co=1&dt=1&specialTxt=3&specialJson="
    # proxy_dict = get_proxies(url=url,headers=headers)
    # print(proxy_dict)
    # # write_proxies_to_redis(proxy_dict)
    # test_proxies(proxies=proxy_dict,headers=headers)
    p = get_pool()
    print(p)
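
The four responsibilities listed in the comment at the top of proxiespool.py (storage, periodic refresh, periodic cleaning, quality checks) are only exercised by hand in the __main__ block. A minimal sketch of running the maintenance functions on a timer is shown below; the 300-second interval and the standalone pool_scheduler.py module are assumptions, not part of the original code.

pool_scheduler.py

# Minimal sketch: keep the proxy pool clean and topped up on a timer.
# The 300-second interval is an arbitrary choice; tune it to the vendor's rate limits.
from time import sleep

import proxiespool

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    url = "http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=json&ss=5&css=&ipport=1&et=1&pi=1&co=1&dt=1&specialTxt=3&specialJson="
    while True:
        # drop proxies that no longer answer a test request...
        proxiespool.clean_proxies(headers=headers, url=url)
        # ...and refill the pool once it has dropped to 10 entries or fewer
        proxiespool.update_proxies(url=url, headers=headers)
        sleep(300)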


2. Adding the proxies to the Qyer spider file

qiongyouSpider.py

import requests
import gevent
from gevent import monkey
monkey.patch_all()
from lxml import etree
from time import sleep
import multiprocessing
import re
import json
import redis
import proxiespool
import random


# Request the first results page for one area and work out how many pages it has
def fetch_areas(url,headers,area):
    # Pick a random proxy from the pool
    ippool = proxiespool.get_pool()
    ipproxy = random.choice(ippool)
    print(ipproxy)
    area_html = requests.get(url=url%(1,area),headers=headers,proxies=ipproxy).text
    a_tree = etree.HTML(area_html)
    # Extract the total number of result pages for this area
    totalPages = int(re.findall(pattern=r"\d+",string=a_tree.xpath("//a[@class='ui_page_item']/text()")[-1])[0])
    # print(area)
    # print(totalPages)
    # Crawl every page of this area with coroutines
    # Keep a list of all the greenlets spawned in the current process
    g_list = []
    for page in range(1,totalPages+1):
        page_url = url%(page,area)
        # Spawn one greenlet per page
        g = gevent.spawn(fetch_pages_per_area,page_url,headers)
        g_list.append(g)
    gevent.joinall(g_list)

# Request one results page and parse every trip plan on it
def fetch_pages_per_area(url,headers):
    print("当前正在请求:",url)
    ippool = proxiespool.get_pool()
    ipproxy = random.choice(ippool)
    res = requests.get(url=url,headers=headers,proxies=ipproxy)
    sleep(1)
    page_tree = etree.HTML(res.text)
    travels = page_tree.xpath("//div[@class='items']")
    for travel in travels:
        item = {}
        item["title"] = travel.xpath(".//dd/text()")[0] if travel.xpath(".//dd/text()") else " "
        item["start_time"] = travel.xpath(".//dt/text()")[0] if travel.xpath(".//dt/text()") else " "
        item["cycle"] = travel.xpath(".//div[@class='day']//text()")[0] if travel.xpath(".//div[@class='day']//text()") else " "
        tag = travel.xpath(".//div[starts-with(@class,'tag')]//text()")
        if len(tag) != 0:
            item["tag"] = " ".join(tag)
        else:
            item["tag"] = "自由行"
        item["plan"] = travel.xpath(".//div[@class='plan']/p/text()")[0] if travel.xpath(".//div[@class='plan']/p/text()") else " "
        item["next_url"] = "http:" + travel.xpath(".//a[@class='link']/@href")[0] if travel.xpath(".//a[@class='link']/@href") else " "
        print(item)
        rds = redis.StrictRedis(host="www.fanjianbo.com",port=6379,db=8)
        rds.lpush("plans",item)






if __name__ == '__main__':
    # Areas (search keywords) to crawl
    areas = ["中国","欧洲","泰国","韩国","日本","新加坡","南美洲"]
    url = "http://plan.qyer.com/search_0_0_0_0_0_0_%d/?keyword=%s"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    # Processes suit CPU-bound work; IO-bound crawling like this is better served by threads or coroutines, so one process is started per area and coroutines fetch the pages inside it
    p_list = []
    for area in areas:
        # Spawn one process per area
        p = multiprocessing.Process(target=fetch_areas,args=(url,headers,area))
        p.start()
        p_list.append(p)

    for p in p_list:
        p.join()

Wait, did I somehow still not write the storage code?

It really isn't hard; I'll tidy it up when I have time (a rough sketch follows below).
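
For reference, a minimal sketch of that storage step could simply drain the "plans" list from Redis into a JSON-lines file. The dump_plans.py module name and the plans.jsonl filename are assumptions; the Redis host and db match what qiongyouSpider.py writes to, and the items are assumed to have been pushed with json.dumps as above.

dump_plans.py

# Minimal sketch of the missing storage step: read the scraped items back out of
# Redis and write them to a local JSON-lines file. The filename is an arbitrary choice.
import json
import redis

def dump_plans(outfile="plans.jsonl"):
    # same Redis host and db that qiongyouSpider.py pushes the "plans" list into
    rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)
    count = rds.llen("plans")
    with open(outfile, "w", encoding="utf-8") as f:
        for raw in rds.lrange("plans", 0, count):
            item = json.loads(raw)
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print("wrote %d plans to %s" % (count, outfile))

if __name__ == '__main__':
    dump_plans()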
