get_ip.py: fetching proxy IPs
import requests
from lxml import etree
import random
"""
Fetch some usable proxy IPs from the xicidaili proxy site.
"""
url = r'http://www.xicidaili.com/wn'
ip_list = []  # holds the scraped proxies
def get_ip_list(url=url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # requests.get() fetches the page; a successful request returns status 200
    response = requests.get(url, headers=headers)
    # parse the page source with lxml's etree.HTML()
    html_str = response.content.decode()
    html = etree.HTML(html_str)
"""
判断网页的设计,得出需要取得ip内容的位置,保存至ip_list
"""
# ip,port tr改变:odd: 1,2,3,4 没有odd:3,5,7,9
# ip1 = html.xpath("//table[@id='ip_list']/tr[@class='odd'][1]/td[2]")
# ip2 = html.xpath("//table[@id='ip_list']/tr[3]/td[2]")
# port tr改变:同上
# port1 = html.xpath("//table[@id='ip_list']/tr[@class='odd'][1]/td[3]")
# potr2 = html.xpath("//table[@id='ip_list']/tr[3]/td[3]")
# 使用xpath helper工具得到xpath
    i = 1
    while True:
        ip = html.xpath("//table[@id='ip_list']/tr[@class='odd'][{}]/td[2]/text()".format(i))
        port = html.xpath("//table[@id='ip_list']/tr[@class='odd'][{}]/td[3]/text()".format(i))
        if ip:
            ip_list.append(ip[0] + ':' + port[0])
            i += 1
        else:
            break
    i = 3
    while True:
        ip = html.xpath("//table[@id='ip_list']/tr[{}]/td[2]/text()".format(i))
        port = html.xpath("//table[@id='ip_list']/tr[{}]/td[3]/text()".format(i))
        if ip:
            ip_list.append(ip[0] + ':' + port[0])
            i += 2
        else:
            break
    # print(ip_list)
"""验证Ip的可用性"""
for ip in ip_list:
try:
proxy_host = "https://" + ip
proxy_temp = {"https": proxy_host}
res = urllib.urlopen(url, proxies=proxy_temp).read()
except Exception as e:
ip_list.remove(ip)
continue
# print(ip_list)
return ip_list
"""
随机从代理池中取ip
"""
def get_ip():
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
"""测试"""
if __name__ == '__main__':
url = 'http://www.xicidaili.com/wn/'
ip_list = get_ip_list(url)
proxies = get_ip()
print(proxies)
my_parser.py: the parsing function
import requests
from bs4 import BeautifulSoup
"""解析函数,返回数据"""
def parser(url, headers , proxies, timeout = 2):
info = {} #保存数据
res = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
# res = requests.get(url,headers=headers)
# print(res.status_code)
# 获取数据
if res.status_code == 200:
soup = BeautifulSoup(res.text, 'lxml')
# codl.class 下 dd dt标签的文本
key = [b.text for b in soup.select('.codl dd')]
value = [p.text for p in soup.select('.codl dt')]
# 存入 info字典
for k, v in zip(value, key):
info[k.strip(':')] = v
return info
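A quick usage sketch for the parser. The sample URL is one of the company pages crawled by the main program below; passing proxies=None just makes requests use a direct connection, and the keys in the printed dict depend on what fields the page actually lists:

from my_parser import parser

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
info = parser('https://m.11467.com/jinan/co/2.htm', headers=headers, proxies=None)
print(info)  # a dict of field-name -> value pairs scraped from the page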
shunqi_spider.py: the main program
import random
import pymongo
from multiprocessing.dummy import Pool as ThreadPool
import get_ip
import my_parser
# Pool of 'User-Agent' strings to rotate through
user_agent_list = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
]
# Database setup
client = pymongo.MongoClient('localhost', 27017)
company_info = client['company_info']  # name the database
sheet_table = company_info['sheet_table']  # create the collection
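For reference, a minimal read-back sketch to check what a run actually stored, using the same database and collection names as above (count_documents needs pymongo 3.7+):

import pymongo

client = pymongo.MongoClient('localhost', 27017)
table = client['company_info']['sheet_table']
print(table.count_documents({}))  # number of company records stored so far
for doc in table.find().limit(3):  # peek at a few documents
    print(doc)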
def get_data(url):
    # Try each page up to 10 times
    for i in range(10):
        try:
            proxies = get_ip.get_ip()
            headers = {
                'User-Agent': random.choice(user_agent_list)
            }
            # Allow 1 second of timeout for network jitter
            info = my_parser.parser(url=url, headers=headers, proxies=proxies, timeout=1)
            if info:
                # Store the record in the database
                sheet_table.insert_one(info)
                # infos[url] = info
                print("data captured")
                break
        except Exception as e:
            print(e, url)
if __name__ == "__main__":
# infos = {} # 要得到的信息
# 待爬取的网页,range(2,160998))
urls = ('https://m.11467.com/jinan/co/{}.htm'.format(str(i)) for i in range(2,30))
# 代理池页面
ip_url = 'http://www.xicidaili.com/wn/'
ip_list = get_ip.get_ip_list(ip_url) # 得到ip代理列表
# # 存储数据
# path = r'D:\code\shunqi_spider\shunqi.txt'
# with open(path,'w') as file:
# for key,value in infos.items():
# file.write(key + ' : ' + str(value) + '\n')
# print("程序运行结束")
# 存储数据到数据库
# 创建线程池
pool = ThreadPool(20)
results = pool.map(get_data,urls)
pool.close()
pool.join()
Summary:
1. xpath-helper to pin down where the data sits in the page
2. Rotating several User-Agents and proxy IPs to get around anti-scraping measures
3. Setting wait times / request timeouts
4. Several ways of parsing a page (lxml XPath, BeautifulSoup)
5. The MongoDB database
6. Thread pools
7. random.choice for random selection
8. Still to study: cookies, sessions, JavaScript (see the sketch below)
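For item 8, a minimal sketch of what requests.Session adds over bare requests.get: cookies set by the server are stored and sent back automatically on later requests (httpbin.org is used here purely as a demo endpoint):

import requests

session = requests.Session()
# The server sets a cookie via this endpoint; the session keeps it
session.get('https://httpbin.org/cookies/set/token/abc123')
# A later request through the same session sends the cookie back
resp = session.get('https://httpbin.org/cookies')
print(resp.json())  # {'cookies': {'token': 'abc123'}}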