Crawling Qyer (穷游网) Through Proxies
1. Fetching proxies from a proxy pool
proxiespool.py
# A proxy pool has four jobs: 1. store proxies (local file or database); 2. refresh them
# periodically; 3. purge the pool periodically; 4. check proxy quality.
import requests
import redis
import json

# Fetch a batch of proxies from the vendor's API.
def get_proxies(url, headers):
    res = requests.get(url=url, headers=headers)
    print(res.text)
    # The API is expected to return JSON shaped like {"data": [{"IP": "ip:port", ...}, ...]}
    return json.loads(res.text)

# Store a proxy entry in Redis. Redis lists only accept strings/bytes, so serialize to JSON.
def write_proxies_to_redis(proxy):
    rds = redis.StrictRedis(host="127.0.0.1", port=6379, db=3)
    rds.lpush("proxies", json.dumps(proxy))

# Quality check: keep only proxies that can fetch a page within 3 seconds.
def test_proxies(proxies, headers):
    for proxy in proxies["data"]:
        try:
            # Build the mapping requests expects; the "IP" field name follows get_pool below
            requests.get(url="https://www.baidu.com/", headers=headers,
                         proxies={"https": proxy["IP"]}, timeout=3)
            write_proxies_to_redis(proxy)
        except Exception as e:
            print("Proxy '%s' failed the quality check!" % proxy)
            print(e)

# Purge: re-test every stored proxy and drop the ones that no longer work.
def clean_proxies(headers, url):
    rds = redis.StrictRedis(host="127.0.0.1", port=6379, db=3)
    for raw in rds.lrange("proxies", 0, -1):
        proxy = json.loads(raw)
        try:
            requests.get(url="https://www.baidu.com/", headers=headers,
                         proxies={"https": proxy["IP"]}, timeout=3)
        except Exception as e:
            print("Proxy '%s' failed the quality check!" % proxy)
            rds.lrem("proxies", 0, raw)
    update_proxies(url=url, headers=headers)

# Periodic refresh: when the pool runs low, fetch and test a new batch.
def update_proxies(url, headers):
    rds = redis.StrictRedis(host="127.0.0.1", port=6379, db=3)
    if rds.llen("proxies") <= 10:
        proxy_dict = get_proxies(url=url, headers=headers)
        test_proxies(proxies=proxy_dict, headers=headers)

# Public interface: return the pool as a list of dicts in the format requests expects.
def get_pool():
    rds = redis.StrictRedis(host="127.0.0.1", port=6379, db=3)
    ippool = []
    for raw in rds.lrange("proxies", 0, -1):
        proxy = json.loads(raw)
        ippool.append({"https": proxy["IP"]})
    return ippool

if __name__ == '__main__':
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    url = "http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=json&ss=5&css=&ipport=1&et=1&pi=1&co=1&dt=1&specialTxt=3&specialJson="
    # proxy_dict = get_proxies(url=url, headers=headers)
    # print(proxy_dict)
    # test_proxies(proxies=proxy_dict, headers=headers)
    p = get_pool()
    print(p)
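Note that proxiespool.py defines the periodic refresh (update_proxies) and purge (clean_proxies) but never actually schedules them. Below is a minimal sketch of one way to run the maintenance cycle on a timer; the script name run_pool.py and the 5-minute interval are my own illustrative choices, not part of the original:
run_pool.py
# Keep the pool fresh: clean_proxies re-tests every stored proxy and then tops
# the pool up via update_proxies, so one periodic call covers both jobs.
import time
import proxiespool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
# The same vendor API URL as in proxiespool.py
url = "http://t.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=json&ss=5&css=&ipport=1&et=1&pi=1&co=1&dt=1&specialTxt=3&specialJson="

while True:
    proxiespool.clean_proxies(headers=headers, url=url)
    time.sleep(300)  # re-run every 5 minutes (arbitrary interval, adjust as needed)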
2. Adding proxies to the Qyer spider
qiongyouSpider.py
import gevent
from gevent import monkey
monkey.patch_all()  # patch blocking IO before importing requests so it cooperates with gevent
import requests
from lxml import etree
from time import sleep
import multiprocessing
import re
import redis
import json
import random
import proxiespool

# Request the first page of an area to learn its page count, then crawl every page with coroutines.
def fetch_areas(url, headers, area):
    # Pick a random proxy from the pool
    ippool = proxiespool.get_pool()
    ipproxy = random.choice(ippool)
    print(ipproxy)
    area_html = requests.get(url=url % (1, area), headers=headers, proxies=ipproxy).text
    a_tree = etree.HTML(area_html)
    # Extract the total number of pages for this area from the last pager link
    totalPages = int(re.findall(pattern=r"\d+", string=a_tree.xpath("//a[@class='ui_page_item']/text()")[-1])[0])
    # A list to track all coroutines spawned in the current process
    g_list = []
    for page in range(1, totalPages + 1):
        page_url = url % (page, area)
        # Spawn one coroutine per page
        g = gevent.spawn(fetch_pages_per_area, page_url, headers)
        g_list.append(g)
    gevent.joinall(g_list)

# Request one listing page of an area and parse every travel plan on it.
def fetch_pages_per_area(url, headers):
    print("Currently requesting:", url)
    ippool = proxiespool.get_pool()
    ipproxy = random.choice(ippool)
    res = requests.get(url=url, headers=headers, proxies=ipproxy)
    sleep(1)
    page_tree = etree.HTML(res.text)
    travels = page_tree.xpath("//div[@class='items']")
    for travel in travels:
        item = {}
        item["title"] = travel.xpath(".//dd/text()")[0] if travel.xpath(".//dd/text()") else " "
        item["start_time"] = travel.xpath(".//dt/text()")[0] if travel.xpath(".//dt/text()") else " "
        item["cycle"] = travel.xpath(".//div[@class='day']//text()")[0] if travel.xpath(".//div[@class='day']//text()") else " "
        tag = travel.xpath(".//div[starts-with(@class,'tag')]//text()")
        if len(tag) != 0:
            item["tag"] = " ".join(tag)
        else:
            item["tag"] = "自由行"  # default tag: independent travel
        item["plan"] = travel.xpath(".//div[@class='plan']/p/text()")[0] if travel.xpath(".//div[@class='plan']/p/text()") else " "
        item["next_url"] = "http:" + travel.xpath(".//a[@class='link']/@href")[0] if travel.xpath(".//a[@class='link']/@href") else " "
        print(item)
        rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)
        # Serialize before pushing: Redis lists only accept strings/bytes, not dicts
        rds.lpush("plans", json.dumps(item, ensure_ascii=False))

if __name__ == '__main__':
    # The areas we want to crawl
    areas = ["中国", "欧洲", "泰国", "韩国", "日本", "新加坡", "南美洲"]
    url = "http://plan.qyer.com/search_0_0_0_0_0_0_%d/?keyword=%s"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    # CPU-bound work generally calls for processes (threads and coroutines are limited by the GIL);
    # IO-bound work like crawling is better served by threads or coroutines. Here we combine the
    # two: one process per area, with one coroutine per page inside each process.
    p_list = []
    for area in areas:
        # Create one process per area
        p = multiprocessing.Process(target=fetch_areas, args=(url, headers, area))
        p.start()
        p_list.append(p)
    for p in p_list:
        p.join()
Wait, is the final storage/export code still missing? It's really not hard; I'll tidy it up when I have time.
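Until then, here is a minimal sketch of that export step, assuming the spider stored each item in the "plans" list as a JSON string (as in the version above). The script name export_plans.py, the CSV format, and the file name plans.csv are illustrative choices, not from the original:
export_plans.py
# Drain the "plans" list from Redis into a CSV file (a sketch, not the author's final code)
import csv
import json
import redis

rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)

fields = ["title", "start_time", "cycle", "tag", "plan", "next_url"]
with open("plans.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    for raw in rds.lrange("plans", 0, -1):
        # Each entry was stored with json.dumps in fetch_pages_per_area
        writer.writerow(json.loads(raw))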