Python 爬取 66IP 代理

最新推荐文章于 2022-09-20 11:27:30 发布

weixin_43554366

最新推荐文章于 2022-09-20 11:27:30 发布

阅读量531

点赞数

本文链接：https://blog.csdn.net/weixin_43554366/article/details/101478433

版权

config.py

# Scraping rules: each entry says which pages to fetch and how to locate
# the proxy data inside them.
config_list = [
    {
        # Pages to crawl (list pages 1 through 5).
        'urls': [f'http://www.66ip.cn/{i}.html' for i in range(1, 6)],
        # Parsing strategy. Must be lowercase: the parser compares this value
        # against 'xpath' and 're'.
        'type': 'xpath',
        # Selects the table *row elements*. The 'target' expressions below are
        # evaluated relative to each row, so this must yield elements, not
        # text nodes.
        'pattern': '//div[@align="center"]//table//tr',
        # Row-relative XPath for the cells holding the proxy IP and port.
        'target': {'ip': 'td[1]', 'port': 'td[2]'},
    },
]

crawler.py

from Download import Downloader  #Download文件中的类导入
from config import config_list #config文件中的字典导入
from myparser import Parser  #parser文件中的类导入

#主框架，其它是子项用于拼凑数据给它
class Crawler():
    """Top-level driver that ties the downloader and the parser together."""

    def crawler(self):
        """Fetch every configured URL, parse it, and print the result.

        For each rule in ``config_list`` and each URL in that rule, the page
        is downloaded; pages that come back empty/falsy are skipped, all
        others are handed to the parser and the parsed value is printed.
        """
        for rule in config_list:
            for page_url in rule['urls']:
                # A fresh downloader is created for every request.
                html = Downloader().download(page_url, rule)
                if not html:
                    # Nothing usable was downloaded; move on to the next URL.
                    continue
                proxies = Parser().parse(html, rule)
                print(proxies)

if __name__ == '__main__':
    # Script entry point: build a crawler and run one pass over all rules.
    Crawler().crawler()

Download.py

import requests
import chardet
import traceback
import time

#下载网页的代码
class Downloader(object):
    """Downloads web pages using a fixed browser-like User-Agent header."""

    def __init__(self):
        # Headers sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
        }

    def download(self, url, confi):
        """Fetch ``url`` and return its decoded text, or ``None`` on failure.

        Args:
            url: Address of the page to download.
            confi: Rule dict for the site. Optional keys read here:
                ``'delay'`` - seconds to sleep after the request (skipped
                when missing or falsy); ``'timeout'`` - request timeout in
                seconds (defaults to 10).

        Returns:
            The response body as text when the status code is 200; ``None``
            when the request fails or any other status is returned. Failures
            are printed together with a traceback rather than propagated.
        """
        print(f'{url}正在下载中....')
        try:
            # Without a timeout a stalled server would block the crawl forever.
            resp = requests.get(
                url,
                headers=self.headers,
                timeout=confi.get('timeout', 10),
            )

            # Optional politeness pause; .get() yields None when no delay is
            # configured instead of raising KeyError.
            delay = confi.get('delay')
            if delay:
                time.sleep(delay)

            if resp.status_code != 200:
                raise ConnectionError(
                    f'unexpected HTTP status {resp.status_code} for {url}'
                )

            # Detect the encoding from the raw bytes so the text decodes
            # correctly even when the response headers omit the charset.
            resp.encoding = chardet.detect(resp.content).get('encoding')
            return resp.text

        except Exception:
            # Best-effort crawl: report the problem and let the caller move on.
            print(f'下载{url}发生异常')
            traceback.print_exc()
            return None

myparser.py

from lxml import etree
import traceback
import re

class Parser(object):
    """Extracts proxy ``ip:port`` entries from page source using a rule dict.

    A rule provides ``'type'`` (``'xpath'`` or ``'re'``), ``'pattern'`` (how to
    find each candidate record) and ``'target'`` (``'ip'`` / ``'port'``
    sub-expressions applied to each record).
    """

    def xpath_parse(self, resp, parse_ruler):
        """Parse ``resp`` with XPath expressions taken from ``parse_ruler``.

        ``parse_ruler["pattern"]`` is evaluated on the whole document; the
        first match is skipped and, for every remaining match, the ``ip`` and
        ``port`` target expressions are evaluated relative to it. Matches that
        lack either value are skipped instead of aborting the whole page.

        Returns:
            A list of ``{'proxyip': 'ip:port'}`` dicts, or ``None`` if parsing
            raised (the traceback is printed).
        """
        try:
            page = etree.HTML(resp)  # build a tree that XPath can query
            rows = page.xpath(parse_ruler["pattern"])
            ip_path = parse_ruler["target"]["ip"]
            port_path = parse_ruler["target"]["port"]

            proxy_list = []
            for row in rows[1:]:  # the first match is skipped
                ip_nodes = row.xpath(ip_path)
                port_nodes = row.xpath(port_path)
                if not ip_nodes or not port_nodes:
                    continue  # row does not have both cells
                ip = ip_nodes[0].text
                port = port_nodes[0].text
                if not ip or not port:
                    continue  # cell has no direct text content
                proxy_list.append({'proxyip': ip + ':' + port})
            return proxy_list
        except Exception:
            traceback.print_exc()
            return None

    def re_parse(self, resp, parse_ruler):
        """Parse ``resp`` with regular expressions taken from ``parse_ruler``.

        ``parse_ruler["pattern"]`` is applied to the whole text with
        ``re.S``; the first match is skipped and, inside every remaining
        match, the ``ip`` and ``port`` target patterns are searched. Matches
        where either pattern finds nothing are skipped.

        Returns:
            A list of ``{'proxyip': 'ip:port'}`` dicts, or ``None`` if parsing
            raised (the traceback is printed).
        """
        try:
            chunks = re.findall(parse_ruler["pattern"], resp, re.S)
            ip_pattern = parse_ruler["target"]["ip"]
            port_pattern = parse_ruler["target"]["port"]

            proxy_list = []
            for chunk in chunks[1:]:  # the first match is skipped
                ips = re.findall(ip_pattern, chunk, re.S)
                ports = re.findall(port_pattern, chunk, re.S)
                if not ips or not ports:
                    continue  # chunk does not contain both values
                # findall returns a list; the first hit is the value we want.
                proxy_list.append({'proxyip': ips[0] + ':' + ports[0]})
            return proxy_list
        except Exception:
            traceback.print_exc()
            return None

    def parse(self, resp, parse_ruler):
        """Dispatch to the parser selected by ``parse_ruler["type"]``.

        The type is matched case-insensitively, so ``'Xpath'`` and ``'xpath'``
        both select :meth:`xpath_parse`. An unsupported type is reported and
        yields ``None``.
        """
        kind = str(parse_ruler["type"]).lower()
        if kind == 'xpath':
            return self.xpath_parse(resp, parse_ruler)
        if kind == 're':
            return self.re_parse(resp, parse_ruler)
        print(f'Unsupported parse type: {parse_ruler["type"]!r}')
        return None