python大作业

最新推荐文章于 2024-06-29 16:23:10 发布

weixin_30394981

最新推荐文章于 2024-06-29 16:23:10 发布

阅读量6.6k

点赞数 3

文章标签： python 操作系统

原文链接：http://www.cnblogs.com/destinymingyun/p/10786742.html

版权

爬取西刺代理

生成请求头（header.py）

# -*- coding: utf-8 -*-
# Public API of this module. The trailing comma matters: ("Header") is just
# the string "Header", which makes `from header import *` look up the names
# "H", "e", "a", ... and fail.
__all__ = ("Header",)
import random


class Header(object):
    """Singleton that builds forged HTTP request headers.

    Every read of :attr:`headers` picks a User-Agent at random from a fixed
    pool of desktop browser identification strings.
    """

    def __init__(self):
        # Runs on every ``Header()`` call, but only re-assigns the same
        # constant pool on the shared instance.
        self.__user_agent = [
            "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)",  # IE
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",  # Firefox
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; zh-CN) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",  # Chrome
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",  # Taobao
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",  # Cheetah
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",  # 360
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",  # Safari
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",  # Sohu
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ",  # Maxthon
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",  # UC
        ]

    @property
    def headers(self):
        """Return a header dict carrying a randomly chosen User-Agent."""
        return {"User-agent": self.user_agent}

    @property
    def user_agent(self):
        """Return one User-Agent string picked at random from the pool."""
        return random.choice(self.__user_agent)

    def __new__(cls):
        """Create the instance once and hand the same object back afterwards.

        The instance is stored under a single-underscore name and looked up
        in ``cls.__dict__``: a double-underscore attribute is name-mangled
        inside the class body while the string given to ``hasattr`` is not,
        so a ``hasattr(cls, "__instance")`` test can never succeed.
        """
        if "_instance" not in cls.__dict__:
            cls._instance = super().__new__(cls)
        return cls._instance

IP 模型类（ip_model.py）

#encoding = utf-8;
__all__ = ("IP_Model", "IP_List");

class IP_Model(object):
    """Hold the fields describing one scraped proxy server."""

    def __init__(self):
        # Initialise every backing field so that the getters, __str__ and
        # to_dict work on a fresh instance instead of raising AttributeError.
        self._country = None
        self._ip = None
        self._port = None
        self._addres = None
        self._http_type = None
        self._velocity = None
        self._anonymous = None

    @property
    def country(self):
        """Country the proxy server is located in (may be None)."""
        return self._country

    @country.setter
    def country(self, ip_country):
        self._country = ip_country

    @property
    def ip(self):
        """IP address of the proxy server."""
        return self._ip

    @ip.setter
    def ip(self, new_ip):
        self._ip = new_ip

    @property
    def port(self):
        """Port the proxy listens on."""
        return self._port

    @port.setter
    def port(self, new_port):
        self._port = new_port

    @property
    def addres(self):
        """Province-level location of the server (may be None)."""
        return self._addres

    @addres.setter
    def addres(self, new_addres):
        self._addres = new_addres

    @property
    def http_type(self):
        """Request type served by the proxy."""
        return self._http_type

    @http_type.setter
    def http_type(self, new_type):
        self._http_type = new_type

    @property
    def velocity(self):
        """Speed reported for the server."""
        return self._velocity

    @velocity.setter
    def velocity(self, http_velocity):
        self._velocity = http_velocity

    @property
    def anonymous(self):
        """True when the proxy was labelled "高匿", False for any other label.

        None until the setter has been used.
        """
        return self._anonymous

    @anonymous.setter
    def anonymous(self, anonymous_text):
        self._anonymous = anonymous_text == "高匿"

    def __str__(self):
        """Return the main fields formatted one per line."""
        return (
            "| country: {} |\n"
            "| ip: {} |\n"
            "| port: {} |\n"
            "| address: {} |\n"
            "| http_type: {} |\n"
            "| velocity: {}|\n"
        ).format(
            self.country, self.ip, self.port,
            self.addres, self.http_type, self.velocity,
        )

    def to_dict(self):
        """Return the fields as a plain dict (``anonymous`` is not included)."""
        return {
            "country": self.country,
            "ip": self.ip,
            "port": self.port,
            "addres": self.addres,
            "http_type": self.http_type,
            "velocity": self.velocity,
        }

    def from_dict(self, dict):
        """Populate the fields from a mapping shaped like ``to_dict()`` output.

        Missing keys leave the corresponding field as None.

        :param dict: mapping with the keys produced by :meth:`to_dict`
        """
        self.country = dict.get("country")
        self.ip = dict.get("ip")
        # Must be a lookup: calling the mapping itself raises TypeError.
        self.port = dict.get("port")
        self.addres = dict.get("addres")
        self.http_type = dict.get("http_type")
        self.velocity = dict.get("velocity")

    def get_ip_proxies(self):
        """Return a one-entry proxies mapping ``{scheme: "ip:port"}``.

        The scheme is "https" when ``http_type`` is "https" in any letter
        case, otherwise "http".
        """
        kind = self.http_type
        # NOTE(review): scraped type labels may be upper-case; compare
        # case-insensitively so "HTTPS" is not silently treated as http.
        is_https = isinstance(kind, str) and kind.lower() == "https"
        scheme = "https" if is_https else "http"
        return {scheme: "{}:{}".format(self.ip, self.port)}

class IP_List(object):
    """Container holding one proxy list for HTTP and one for HTTPS."""

    def __init__(self):
        # Both lists start out unset; they are assigned from outside.
        self.http_list = self.https_list = None

保存到 csv（save.py）

#encoding = utf-8
import pandas;

'''
    供simple_proxy使用的保存数据函数集
'''

def to_pandas_DataFrame(ips_list):
    """Turn a sequence of IP models into a pandas DataFrame.

    :param ips_list: iterable of objects exposing ``to_dict()``
    :return: a ``pandas.DataFrame`` with one row per element
    """
    rows = [model.to_dict() for model in ips_list]
    return pandas.DataFrame(rows)

def to_csv(dicts, path="./ips_info.csv", encoding="ANSI"):
    """Append a list of IP models to a CSV file.

    :param dicts: iterable of objects exposing ``to_dict()`` (converted via
        ``to_pandas_DataFrame``)
    :param path: target CSV file; defaults to ``./ips_info.csv``
    :param encoding: text encoding of the file; when the requested codec is
        not available on this platform, UTF-8 is used instead
    """
    import codecs
    import os

    # "ANSI" is only registered as a codec alias on Windows; fall back to
    # UTF-8 rather than crashing with LookupError on other platforms.
    try:
        codecs.lookup(encoding)
    except LookupError:
        encoding = "utf-8"

    # The file is opened in append mode, so emit the column header only when
    # starting a new (or empty) file; otherwise every call would insert a
    # header row in the middle of the data.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0
    to_pandas_DataFrame(dicts).to_csv(
        path, mode="a", encoding=encoding, header=write_header
    )

def read_csv(path, start, step):
    """Read ``step`` rows of IP data from a CSV file, beginning at ``start``.

    Not implemented yet: the function currently does nothing and returns None.

    :param path: path of the CSV file
    :param start: row to start reading from
    :param step: number of rows to read per call
    :return: intended to be the corresponding ip list; currently None
    """
    return None

爬取西刺主体代码

# encoding = utf-8
__all__ = ("html_to_dom", "ProxyIPWorm");
import requests;
from header import Header;
from bs4 import BeautifulSoup;
from ip_model import IP_Model, IP_List;
import save;
import time;
import re;


def simple_proxy(read_out):
    """Placeholder for building a simple proxy IP.

    Not implemented yet: the argument is ignored and None is returned.

    :param read_out: unused for now
    :return: None
    """
    return None

def html_to_dom(url, header, proxies=None, timeout=10):
    """Fetch ``url`` with ``requests`` and parse the body with BeautifulSoup.

    :param url: URL to request
    :param header: request headers to send
    :param proxies: optional ``requests`` proxies mapping; None means a
        direct connection
    :param timeout: seconds to wait for the server before giving up; without
        a timeout ``requests`` may block indefinitely on a dead host or proxy
    :return: a ``BeautifulSoup`` tree for a 200 response, otherwise None
    :raises requests.RequestException: on connection errors or timeouts
    """
    # requests treats proxies=None the same as omitting the argument, so a
    # single call covers both the proxied and the direct case.
    response = requests.get(
        url, headers=header, proxies=proxies, verify=True, timeout=timeout
    )
    if response.status_code != 200:
        return None
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")

def proxy(url, ips, log):
    """Request ``url`` through the given proxies until one of them succeeds.

    :param url: URL to fetch; its scheme must be http or https
    :param ips: object carrying ``http_list`` and ``https_list`` proxy lists
    :param log: when truthy, print progress information
    :return: the BeautifulSoup DOM from the first proxy that returns a
        page, or None when no proxy in the list succeeds
    :raises RuntimeError: when ``ips`` (or the list for the scheme) is None,
        or when the URL scheme is neither http nor https
    """
    from urllib.parse import urlsplit

    if ips is None:
        raise RuntimeError("代理列表为空")

    # Take the scheme from a real URL parser: a greedy "(.*):.*" pattern
    # captures up to the LAST colon and so breaks on URLs containing a port.
    scheme = urlsplit(url).scheme
    if scheme == "http":
        ip_list = ips.http_list
    elif scheme == "https":
        ip_list = ips.https_list
    else:
        raise RuntimeError("不支持此类请求")
    if ip_list is None:
        raise RuntimeError("代理列表为空")

    if log:
        print("请求类型{}\n".format(scheme))
    for ip in ip_list:
        proxies = {scheme: "{}:{}".format(ip.ip, ip.port)}
        if log:
            print(proxies)
        try:
            dom = html_to_dom(url, Header().headers, proxies)
        except requests.RequestException as error:
            # An unreachable proxy must not abort the rotation; move on to
            # the next candidate.
            if log:
                print("代理不可用: {}\n".format(error))
            continue
        if log:
            print("当前ip:\n{}\n".format(ip))
        if dom is not None:
            return dom
    return None

class ProxyIPWorm(object):
    """Crawl proxy IP listings from www.xicidaili.com."""

    def __init__(self):
        # The landing page is fetched once; its pagination bar is what
        # ``end_page`` reads the total page count from.
        self.proxy_ip_html = "https://www.xicidaili.com/nn/"
        self.dom_tree = html_to_dom(self.proxy_ip_html, Header().headers)

    @property
    def start_page(self):
        """First page number.

        :return: always 1
        """
        return 1

    @property
    def end_page(self):
        """Total number of listing pages, read from the pagination bar.

        :return: the page count as an int
        :raises RuntimeError: when the landing page could not be fetched or
            holds no usable pagination links
        """
        if self.dom_tree is None:
            raise RuntimeError("无法获取代理列表首页")
        page_dom = self.dom_tree.select(".pagination a")
        if len(page_dom) < 2:
            raise RuntimeError("无法解析总页数")
        # The second-to-last pagination link carries the last page number.
        self._end_page = page_dom[-2]
        return int(self._end_page.text)

    def page_url(self, type, page):
        """Build the listing URL for a proxy type and page number.

        :param type: "http" or "https"
        :param page: page number, from 1 to ``end_page``
        :return: the URL of that listing page
        :raises RuntimeError: when ``page`` is out of range or ``type`` is
            not supported
        """
        if page < 1:
            raise RuntimeError("页数小于1")
        if page > self.end_page:
            raise RuntimeError("页数大于总页数")
        section = self.http_type(type)
        if page == 1:
            return "https://www.xicidaili.com/{}/".format(section)
        return "https://www.xicidaili.com/{}/{}".format(section, page)

    def http_type(self, type):
        """Map a request type to the site's URL section name.

        :param type: "http" or "https"
        :return: "wt" for http, "wn" for https
        :raises RuntimeError: for any other value
        """
        if type == "http":
            return "wt"
        if type == "https":
            return "wn"
        raise RuntimeError("type应该为http或https")

    def get_page_ips(self, type, page):
        """Fetch one listing page and parse every proxy row on it.

        :param type: "http" or "https"
        :param page: page number to crawl
        :return: list of ``IP_Model`` objects, one per table row
        :raises RuntimeError: when the page did not return HTTP 200
        """
        url = self.page_url(type, page)
        print(url)
        page_dom = html_to_dom(url, Header().headers)
        if page_dom is None:
            raise RuntimeError("页面请求失败: {}".format(url))
        rows = page_dom.select("table tr")
        # The first row is skipped, the remaining rows are parsed as data.
        return [self.get_ip_info(row) for row in rows[1:]]

    def get_ip_info(self, ip_dom):
        """Parse one table row into an ``IP_Model``.

        Cells are read by position: 0 country flag image, 1 ip, 2 port,
        3 address, 4 anonymity label, 5 request type, 6 speed (``title`` of
        the inner div).

        :param ip_dom: the ``tr`` node holding one proxy entry
        :return: the populated ``IP_Model``
        """
        ip_info = IP_Model()
        ip_td = ip_dom.select("td")
        country = ip_td[0].img

        ip_info.http_type = ip_td[5].text
        if country is not None:
            ip_info.country = str(country.get("alt"))
            # Guard against a blank address cell, which would otherwise
            # raise IndexError on the [0] lookup.
            address_parts = ip_td[3].text.split()
            if address_parts:
                ip_info.addres = address_parts[0]
        ip_info.ip = ip_td[1].text
        ip_info.port = ip_td[2].text
        ip_info.anonymous = ip_td[4].text
        ip_info.velocity = ip_td[6].div.get("title")
        return ip_info

    def get_pages_ips(self, type, start_page, end_page, save_in=None, delay=10):
        """Crawl every page from ``start_page`` through ``end_page`` inclusive.

        :param type: "http" or "https"
        :param start_page: first page to crawl (>= 1)
        :param end_page: last page to crawl (included)
        :param save_in: callback receiving each page's list of ``IP_Model``;
            None (the default) resolves to ``save.to_csv`` at call time
        :param delay: seconds to pause between two page requests
        :return: all ``IP_Model`` objects collected across the crawled pages
        :raises RuntimeError: when the page range is invalid
        """
        if start_page >= end_page:
            raise RuntimeError("开始页大于等于结束页")
        if start_page < 1:
            raise RuntimeError("开始页小于1")
        if end_page > self.end_page:
            raise RuntimeError("结束页大于总页数")
        if save_in is None:
            save_in = save.to_csv

        collected = []
        # +1 so that the end page itself is crawled, as documented.
        for page in range(start_page, end_page + 1):
            print("当前页:{}".format(page))
            page_list = self.get_page_ips(type, page)
            save_in(page_list)
            collected.extend(page_list)
            # Pause between requests; no need to wait after the final page.
            if page != end_page:
                time.sleep(delay)
        return collected

测试代码

if __name__ == "__main__":
    # Manual smoke test: crawl the first https and http listing pages, then
    # fetch a news page through the collected proxies and print its DOM.
    worm = ProxyIPWorm()
    first_https_page = worm.get_page_ips("https", 1)
    first_http_page = worm.get_page_ips("http", 1)

    proxy_pool = IP_List()
    proxy_pool.https_list = first_https_page
    proxy_pool.http_list = first_http_page

    print(proxy("http://news.gzcc.cn/html/xiaoyuanxinwen/", proxy_pool, True))