python爬取西刺ip

'''
Mongcontion     存入mongodb
get_html        获取西刺网页
get_ip          抓取ip
GetIP           验证ip是否可用,删除无用的ip,返回一个可用的ip
使用的时候可以直接调用get_main() 这个接口

'''
import requests
from lxml import etree
import pymongo

from multiprocessing import Pool

# Shared MongoDB handles: collection "cixi" in database "ipSpider" on the
# local server. The functions below read ``dbip`` as a module-level global.
client = pymongo.MongoClient('localhost', 27017)
db = client['ipSpider']
dbip = db['cixi']

# Browser-like headers attached to the HTTP requests made in this file.
headers1 = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
    ),
}


def Mongcontion(*ip_list):
    """Store one proxy record in the module-level ``dbip`` collection.

    Args:
        *ip_list: Exactly four positional values, in this order:
            ip, port, ip_type, timec.

    Returns:
        The module-level ``dbip`` collection object.

    Raises:
        ValueError: If the number of positional arguments is not four.
    """
    ip, port, ip_type, timec = ip_list
    # Collection.insert() was deprecated in PyMongo 3.0 and removed in 4.0;
    # insert_one() is the supported way to write a single document.
    dbip.insert_one({'ip': ip, 'port': port, 'ip_type': ip_type, 'timec': timec})
    return dbip



def get_html(url, timeout=10):
    """Download *url* and parse the response body with ``etree.HTML``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The element returned by ``etree.HTML`` for the downloaded page.

    Raises:
        requests.RequestException: If the request fails or times out.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, headers=headers1, timeout=timeout)
    print(response.status_code, response.url, response.encoding)
    # bytes.decode() defaults to UTF-8; the page is decoded explicitly rather
    # than relying on the encoding guessed by requests.
    html = response.content.decode()
    return etree.HTML(html)

def get_ip(html):
    """Yield proxy entries found in a parsed proxy-list page.

    Every ``<tr>`` carrying a ``class`` attribute is considered, except the
    first match, which is dropped. Rows that contain ``<th>`` cells are
    reported as header rows and skipped.

    Args:
        html: Parsed document exposing an ``xpath()`` method.

    Yields:
        ``(ip, port, ip_type, timec)`` tuples holding the text of the 2nd,
        3rd, 6th and 7th ``<td>`` cell of each remaining row.

    Raises:
        IndexError: If a data row has fewer ``<td>`` cells than required.
    """
    rows = html.xpath('//tr[@class]')[1:]
    print(len(rows))
    for row in rows:
        if row.xpath('./th'):
            print('这个是表头')
            continue
        # Query the cells once and index into the list.
        cells = row.xpath('./td')
        ip = cells[1].text
        port = cells[2].text
        ip_type = cells[5].text
        timec = cells[6].text
        yield (ip, port, ip_type, timec)


class GetIP:
    """Validate stored proxies and hand out a working one.

    All methods operate on the module-level ``dbip`` MongoDB collection.
    """

    def delete_ip(self, ip):
        """Delete every stored record whose ``ip`` field equals *ip*.

        Args:
            ip: The IP address whose records should be removed.

        Returns:
            Always ``True``.
        """
        # Collection.remove() was removed in PyMongo 4.0; delete_many() keeps
        # its "delete all matching documents" semantics.
        dbip.delete_many({"ip": ip})
        return True

    def judge_ip(self, ip, port, timeout=5):
        """Check whether the proxy ``ip:port`` can fetch a test page.

        A proxy that fails the check is deleted from the collection.

        Args:
            ip: Proxy host.
            port: Proxy port.
            timeout: Seconds to wait for the request through the proxy.

        Returns:
            ``True`` if the request succeeds with a 2xx status code,
            otherwise ``False``.
        """
        http_url = "http://www.baidu.com"
        proxy_url = 'http://{0}:{1}'.format(ip, port)
        proxy_dict = {
            "http": proxy_url,
        }
        try:
            # Without a timeout a dead proxy can block this call indefinitely.
            response = requests.get(
                http_url,
                headers=headers1,
                proxies=proxy_dict,
                timeout=timeout,
            )
        except Exception:
            # Deliberately broad: any failure while going through the proxy
            # means the proxy is unusable.
            print('invalid ip and port 1')
            self.delete_ip(ip)
            return False

        if 200 <= response.status_code < 300:
            print('effective ip 200')
            return True

        print('invalid ip and port 2')
        self.delete_ip(ip)
        return False

    def get_random_ip(self):
        """Return a working proxy URL drawn at random from the collection.

        Up to 30 random records are sampled at a time and tested in turn.
        Proxies that fail are deleted by ``judge_ip``, so each round shrinks
        the collection until a working proxy is found or nothing is left.

        Returns:
            A string of the form ``"http://ip:port"``, or ``None`` when the
            collection contains no usable proxy.
        """
        # Iterate instead of recursing once per bad proxy, which could exceed
        # the interpreter's recursion limit on a large collection.
        while True:
            candidates = list(dbip.aggregate([{'$sample': {'size': 30}}]))
            if not candidates:
                return None
            failed = set()
            for ip_info in candidates:
                ip = ip_info.get('ip')
                port = ip_info.get('port')
                # Skip duplicates of a pair that already failed in this batch.
                if (ip, port) in failed:
                    continue
                if self.judge_ip(ip, port):
                    return "http://{0}:{1}".format(ip, port)
                failed.add((ip, port))

def get_main():
    """Scrape the proxy list, store it, and return one working proxy.

    The xicidaili front page is fetched and parsed, and every extracted entry
    is handed to ``Mongcontion`` through a pool of five worker processes.
    After all queued tasks have finished, a proxy is chosen with
    ``GetIP.get_random_ip``.

    Returns:
        Whatever ``GetIP.get_random_ip`` returns.
    """
    pool = Pool(5)
    page = get_html('https://www.xicidaili.com/')
    for ip, port, ip_type, timec in get_ip(page):
        # The async results are not collected, so worker-side errors are
        # not reported here.
        pool.apply_async(Mongcontion, (ip, port, ip_type, timec))
    pool.close()  # no further tasks will be submitted to the pool
    pool.join()  # block until every queued task has finished

    return GetIP().get_random_ip()

if __name__ == '__main__':
    # Script entry point: run the full scrape/store/validate cycle and print
    # the proxy that was selected.
    print(get_main())

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值