Python Crawler: Scraping Common IP Proxies

Import packages

import requests
from bs4 import BeautifulSoup
import re

Scraping proxy360

def IPspider1(url):
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    proxyList = []
    ipss = soup.find_all('div', class_='proxylistitem')
    for ips in ipss:
        # Each list item holds the IP and the port in separate spans
        ip = ips.find_all('span', class_='tbBottomLine')
        proxy = {'http': ip[0].text.strip() + ':' + ip[1].text.strip(),
                 'https': ip[0].text.strip() + ':' + ip[1].text.strip()}
        proxyList.append(proxy)
    print("Scraped {} proxy IPs from proxy360 this run".format(len(ipss)))
    return proxyList
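IPspider1 collects addresses without testing them, so stale entries slip through. A minimal validation sketch, assuming http://httpbin.org/ip as the test endpoint (check_proxy is a hypothetical helper, not part of the original post):

def check_proxy(proxy, timeout=5):
    # Returns True if the proxy answers a request to httpbin.org/ip in time
    try:
        response = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=timeout)
        return response.status_code == requests.codes.ok
    except requests.RequestException:
        return False

With it, the result of IPspider1 can be filtered down to live proxies only, e.g. working = [p for p in IPspider1(url) if check_proxy(p)].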

Scraping Xici Daili, part 1

def IPspider2(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}

    response = requests.get(url, headers=header)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    iptable = soup.find('table')
    proxyList = []
    ips = iptable.find_all('tr', class_='odd')
    for ip in ips:
        # Columns 1 and 2 of each row hold the IP and the port
        tds = ip.find_all('td')
        addr = tds[1].text.strip() + ':' + tds[2].text.strip()
        proxy = {'http': addr, 'https': addr}
        test_url = "http://httpbin.org/ip"

        try:
            response = requests.get(test_url, proxies=proxy, timeout=5)
            if response.status_code == requests.codes.ok:
                print('http://' + addr + " is a valid proxy")
                print(response.text)
                proxyList.append(proxy)
            else:
                print('http://' + addr + " is an invalid proxy")
        except requests.RequestException:
            continue
    print(proxyList)
    print("Scraped {} proxy IPs from Xici Daili this run".format(len(ips)))
    return proxyList
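httpbin.org/ip echoes the caller's IP back as JSON, so beyond checking the status code one can also confirm that traffic really exits through the proxy. A minimal sketch under that assumption (verify_anonymity is a hypothetical helper using the same proxy-dict format as above):

def verify_anonymity(proxy, timeout=5):
    # The proxy dict stores "ip:port"; keep only the ip part
    proxy_ip = proxy['http'].split(':')[0]
    try:
        origin = requests.get('http://httpbin.org/ip',
                              proxies=proxy, timeout=timeout).json()['origin']
        # A matching exit IP means the proxy is not leaking our own address
        return proxy_ip in origin
    except (requests.RequestException, ValueError):
        return False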

Scraping Xici Daili, part 2

def IPspider3(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}

    session = requests.session()
    page = session.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    proxyList = []
    # Data rows alternate between class "odd" and an empty class, so the
    # regex matches both while skipping the classless header row
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        proxy = {'http': tdlist[1].string + ':' + tdlist[2].string,
                 'https': tdlist[1].string + ':' + tdlist[2].string}
        test_url = "http://1212.ip138.com/ic.asp"

        try:
            response = session.get(test_url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 10:  # ten working proxies are enough
                break
        except requests.RequestException:
            continue
    print(proxyList)
    return proxyList

Test

IPspider1("http://www.proxy360.cn/default.aspx")
IPspider2("http://www.xicidaili.com/")
IPspider3("http://www.xicidaili.com/nn")
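Since IPspider1 and IPspider3 both return lists of {'http': ..., 'https': ...} dicts, their results can be merged into one pool and rotated per request. A minimal usage sketch (the merging and the random rotation are assumptions, not part of the original code):

import random

pool = IPspider1('http://www.proxy360.cn/default.aspx') + \
       IPspider3('http://www.xicidaili.com/nn')
if pool:
    # Pick a different proxy for each request to spread the load
    r = requests.get('http://httpbin.org/ip', proxies=random.choice(pool), timeout=5)
    print(r.text)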