Python Crawler: Scraping Common IP Proxies

Import packages

import requests
from bs4 import BeautifulSoup
import re

Scraping proxy360

def IPspider1(url):
    response = requests.get(url)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    proxyList = []
    ipss = soup.find_all('div', class_='proxylistitem')
    for ips in ipss:
        # Each list item holds the IP and the port in separate spans
        ip = ips.find_all('span', class_='tbBottomLine')
        proxy = {'http': ip[0].text.strip() + ':' + ip[1].text.strip(),
                 'https': ip[0].text.strip() + ':' + ip[1].text.strip()}
        proxyList.append(proxy)
    print("Scraped {} proxy IPs from proxy360 this run".format(len(ipss)))
    return proxyList
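IPspider1 collects addresses without testing them, so stale entries slip through. A minimal validation sketch, assuming http://httpbin.org/ip as the test endpoint (check_proxy is a hypothetical helper, not part of the original post):

def check_proxy(proxy, timeout=5):
    # Returns True if the proxy answers a request to httpbin.org/ip in time
    try:
        response = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=timeout)
        return response.status_code == requests.codes.ok
    except requests.RequestException:
        return False

With it, the result of IPspider1 can be filtered down to live proxies only, e.g. working = [p for p in IPspider1(url) if check_proxy(p)].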

Scraping Xici Daili, part 1

def IPspider2(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}

    response = requests.get(url, headers=header)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    iptable = soup.find('table')
    proxyList = []
    ips = iptable.find_all('tr', class_='odd')
    for ip in ips:
        # Columns 1 and 2 of each row hold the IP and the port
        tds = ip.find_all('td')
        addr = tds[1].text.strip() + ':' + tds[2].text.strip()
        proxy = {'http': addr, 'https': addr}
        test_url = "http://httpbin.org/ip"

        try:
            response = requests.get(test_url, proxies=proxy, timeout=5)
            if response.status_code == requests.codes.ok:
                print('http://' + addr + " is a valid proxy")
                print(response.text)
                proxyList.append(proxy)
            else:
                print('http://' + addr + " is an invalid proxy")
        except requests.RequestException:
            continue
    print(proxyList)
    print("Scraped {} proxy IPs from Xici Daili this run".format(len(ips)))
    return proxyList
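httpbin.org/ip echoes the caller's IP back as JSON, so beyond checking the status code one can also confirm that traffic really exits through the proxy. A minimal sketch under that assumption (verify_anonymity is a hypothetical helper using the same proxy-dict format as above):

def verify_anonymity(proxy, timeout=5):
    # The proxy dict stores "ip:port"; keep only the ip part
    proxy_ip = proxy['http'].split(':')[0]
    try:
        origin = requests.get('http://httpbin.org/ip',
                              proxies=proxy, timeout=timeout).json()['origin']
        # A matching exit IP means the proxy is not leaking our own address
        return proxy_ip in origin
    except (requests.RequestException, ValueError):
        return False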

Scraping Xici Daili, part 2

def IPspider3(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}

    session = requests.session()
    page = session.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    proxyList = []
    # Data rows alternate between class "odd" and an empty class, so the
    # regex matches both while skipping the classless header row
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        proxy = {'http': tdlist[1].string + ':' + tdlist[2].string,
                 'https': tdlist[1].string + ':' + tdlist[2].string}
        test_url = "http://1212.ip138.com/ic.asp"

        try:
            response = session.get(test_url, proxies=proxy, timeout=5)
            proxyList.append(proxy)
            if len(proxyList) == 10:  # ten working proxies are enough
                break
        except requests.RequestException:
            continue
    print(proxyList)
    return proxyList

Test

IPspider1("http://www.proxy360.cn/default.aspx")
IPspider2("http://www.xicidaili.com/")
IPspider3("http://www.xicidaili.com/nn")
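Since IPspider1 and IPspider3 both return lists of {'http': ..., 'https': ...} dicts, their results can be merged into one pool and rotated per request. A minimal usage sketch (the merging and the random rotation are assumptions, not part of the original code):

import random

pool = IPspider1('http://www.proxy360.cn/default.aspx') + \
       IPspider3('http://www.xicidaili.com/nn')
if pool:
    # Pick a different proxy for each request to spread the load
    r = requests.get('http://httpbin.org/ip', proxies=random.choice(pool), timeout=5)
    print(r.text)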