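'''
Mongcontion: store a scraped record in MongoDB
get_html: fetch the Xici proxy-list web page
get_ip: extract the IP entries from the page
GetIP: check whether an IP is usable, delete unusable IPs, and return a usable one
To use this module, simply call the get_main() entry point.
'''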
import requests
from lxml import etree
import pymongo
from multiprocessing import Pool
# MongoDB handles shared by the functions in this module: a client for the
# local server, the ``ipSpider`` database, and its ``cixi`` collection.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['ipSpider']
dbip = db['cixi']

# HTTP headers passed to the ``requests.get`` calls in this module.
headers1 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}
def Mongcontion(*ip_list):
    """Store one scraped proxy record in the module-level ``dbip`` collection.

    Args:
        *ip_list: exactly four positional values, in this order:
            ``ip``, ``port``, ``ip_type`` and ``timec``.

    Returns:
        The ``dbip`` collection the record was written to.

    Raises:
        ValueError: if the number of positional values is not four
            (raised by the tuple unpacking).
    """
    ip, port, ip_type, timec = ip_list
    # ``Collection.insert`` was deprecated in PyMongo 3 and removed in
    # PyMongo 4; ``insert_one`` is the supported single-document call.
    dbip.insert_one(
        {'ip': ip, 'port': port, 'ip_type': ip_type, 'timec': timec}
    )
    return dbip
def get_html(url, timeout=10):
    """Download ``url`` and return it parsed as an lxml HTML element.

    Args:
        url: address of the page to fetch.
        timeout: seconds ``requests`` may wait on the server before giving
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        The element produced by ``etree.HTML`` for the response body.

    Raises:
        requests.RequestException: on connection errors or when the
            timeout expires.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers1, timeout=timeout)
    print(response.status_code, response.url, response.encoding)
    # Decode the raw bytes (UTF-8 by default) into the HTML string ...
    html = response.content.decode()
    # ... and parse that string into an element tree.
    return etree.HTML(html)
def get_ip(html):
    """Yield ``(ip, port, ip_type, timec)`` tuples scraped from table rows.

    The rows are every ``<tr>`` carrying a ``class`` attribute. The first
    such row is always skipped, as is any row that contains ``<th>`` cells.
    The four values are the text of the row's 2nd, 3rd, 6th and 7th
    ``<td>`` cells respectively.

    Args:
        html: a parsed document exposing ``xpath()`` (e.g. the element
            returned by ``etree.HTML``).

    Yields:
        One 4-tuple of cell texts per data row. Rows with fewer than seven
        ``<td>`` cells are skipped instead of raising ``IndexError``.
    """
    rows = html.xpath('//tr[@class]')[1:]
    print(len(rows))
    for row in rows:
        if row.xpath('./th'):
            # Header row: nothing to extract.
            print('这个是表头')
            continue
        cells = row.xpath('./td')
        if len(cells) < 7:
            # Malformed or unexpected row; one bad row must not abort the
            # whole scrape.
            continue
        yield (cells[1].text, cells[2].text, cells[5].text, cells[6].text)
class GetIP:
    """Check proxies stored in the ``dbip`` collection and return a working one."""

    def delete_ip(self, ip):
        """Delete every stored record whose ``ip`` field equals ``ip``.

        Returns:
            Always ``True``.
        """
        # ``Collection.remove`` was removed in PyMongo 4; ``delete_many``
        # keeps the original "delete all matches" semantics.
        dbip.delete_many({"ip": ip})
        return True

    def judge_ip(self, ip, port, timeout=10):
        """Test a proxy by fetching a page through it.

        A proxy that fails the check is deleted from the collection.

        Args:
            ip: proxy host.
            port: proxy port.
            timeout: seconds to wait for the proxied request. Without a
                timeout an unresponsive proxy blocks the check forever.

        Returns:
            ``True`` if the proxied request returned a 2xx status,
            otherwise ``False``.
        """
        http_url = "http://www.baidu.com"
        proxy_url = 'http://{0}:{1}'.format(ip, port)
        proxy_dict = {
            "http": proxy_url,
        }
        try:
            response = requests.get(
                http_url,
                headers=headers1,
                proxies=proxy_dict,
                timeout=timeout,
            )
        except Exception:
            # Deliberately broad: any failure while going through the proxy
            # (refused, timed out, malformed address, ...) means it is
            # unusable, and a bad record must not abort the search.
            print('invalid ip and port 1')
            self.delete_ip(ip)
            return False

        if 200 <= response.status_code < 300:
            print('effective ip 200')
            return True

        print('invalid ip and port 2')
        self.delete_ip(ip)
        return False

    def get_random_ip(self):
        """Return the URL of a randomly sampled proxy that passes ``judge_ip``.

        Up to 30 records are sampled per round and checked in turn. Failed
        candidates are deleted by ``judge_ip``, so each round shrinks the
        collection and the loop ends once it is empty. This is an iterative
        replacement for unbounded recursion, which could exhaust the stack.

        Returns:
            ``"http://<ip>:<port>"`` for the first working proxy, or
            ``None`` when the collection holds no records.
        """
        while True:
            candidates = list(dbip.aggregate([{'$sample': {'size': 30}}]))
            if not candidates:
                return None
            for ip_info in candidates:
                ip = ip_info.get('ip')
                port = ip_info.get('port')
                if self.judge_ip(ip, port):
                    return "http://{0}:{1}".format(ip, port)
def get_main():
    """Scrape the proxy list page, store every entry, and return a working proxy.

    Returns:
        The value returned by ``GetIP().get_random_ip()``.

    Raises:
        Exception: any error raised while fetching the page or while
            storing a record is propagated to the caller instead of being
            silently dropped.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import list.
    from multiprocessing.pool import ThreadPool

    url = 'https://www.xicidaili.com/'
    html = get_html(url)

    # Threads rather than processes: the inserts are I/O-bound, and the
    # module-level MongoClient is thread-safe but must not be shared across
    # a fork. ``starmap`` blocks until every insert has finished and
    # re-raises the first worker exception. The ``with`` block releases the
    # pool even on error.
    with ThreadPool(5) as pool:
        pool.starmap(Mongcontion, get_ip(html))

    getip = GetIP()
    return getip.get_random_ip()
if __name__ == '__main__':
    # Script entry point: run the scrape and print the selected proxy.
    print(get_main())