Crawling all of Quanshuwang (全书网) with Python multithreading
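
Category listing pages on the site follow the pattern http://www.quanshuwang.com/list/<category>_<page>.html. The script below reads the category navigation on the home page along with each category's last page number, builds one URL queue per category, runs one worker thread per queue, and writes the scraped book metadata (title, link, cover image, author) into a MySQL table named after the category.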

#!/usr/bin/env python
# -*- coding:utf-8 -*-
#@author:Chris iven
#Python version 3.6
from lxml import etree
import requests,random,re
from requests.exceptions import RequestException,ConnectionError,ReadTimeout
from fake_useragent import UserAgent
import queue,threading,pymysql
class QS_Spider(object):
    # Do we need __init__ at all? It only stores the entry URL.
    def __init__(self,url):
        self.url = url
    def Get_All_Page_Number(self):
        # Collect the page count and title of every category.
        ua = UserAgent()
        headers = {"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                    "Accept-Encoding":"gzip,deflate",
                    "Accept-Language":"zh-CN,zh;q=0.8",
                   "User-Agent":ua.random,
                   "Referer":"http://www.quanshuwang.com/"}
        print("当前的UA是:",headers["User-Agent"])
        try:
            response = requests.get(self.url,headers=headers,timeout=10)
        except ConnectionError:
            return None
        except ReadTimeout:
            return None
        except RequestException:
            return None
        response.encoding = "gbk"
        if response.status_code != 200:
            return None
        print("Fetched the home page successfully!")
        html = etree.HTML(response.text)
        All_Link = html.xpath('//ul[@class="channel-nav-list"]/li/a/@href')        # all category links
        All_Link_Name = html.xpath('//ul[@class="channel-nav-list"]/li/a/text()')  # category titles
        Page_Data = []  # maximum page number of every category
        for i in All_Link:
            try:
                child_response = requests.get(i, headers=headers, timeout=10)
            except RequestException:
                return None
            child_response.encoding = "gbk"
            child_html = etree.HTML(child_response.text)
            # The "last" pagination link carries the highest page number of the category.
            Page_Data.append(int(child_html.xpath('//a[@class="last"]/text()')[0]))
        return Page_Data, All_Link_Name

    def Struc_Ture_URL(self,Number,param):
        # Combine the category id and page index into listing-page URLs.
        # Returns a queue holding every page URL of one category.
        url_queue = queue.Queue()
        n_url = "http://www.quanshuwang.com/list/"
        for i in range(1,param+1):
            url = n_url+str(Number)+"_"+str(i)+".html"
            print("正在放入:",url)
            url_queue.put_nowait(url)
        return url_queue
    def Request_Url(self,q,Name):
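        # Worker loop: keep pulling page URLs from this category's queue until it is empty.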
        while True:
            try:
                url = q.get_nowait()
            except queue.Empty:
                # The queue is empty: this worker has finished its category.
                break
            print("线程名称:%s, 链接:%s"%(threading.current_thread().name,url))
            ua = UserAgent()
            headers = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                           "Accept-Encoding": "gzip,deflate",
                           "Accept-Language": "zh-CN,zh;q=0.8",
                           "User-Agent": ua.random,
                           "Referer": "http://www.quanshuwang.com/"}
            try:
                response = requests.get(url,headers=headers,timeout=10)
            except ConnectionError:
                print("Connection error, skipping:", url)
                continue
            except RequestException:
                print("Request failed, skipping:", url)
                continue
            response.encoding ="gbk"
            if response.status_code == 200:
                print("访问成功!")
                self.Parsing_Html(response.text,Name)

    def Parsing_Html(self,response,Name):
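        # Pull the book link, cover image, title and author out of one listing page, then store them.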
        html = etree.HTML(response)
        url = html.xpath('//a[@class="l mr10"]/@href')
        img = html.xpath('//a[@class="l mr10"]/img/@src')
        title = html.xpath('//span[@class="l"]/a/@title')
        author = re.findall('<span class="l".*?<a href=.*?>(.*?)</a><em.*?',response,re.S)
        print("url:",url,"\n")
        print("img:",img,"\n")
        print("title:",title,"\n")
        self.write_to_mysql(title,url,img,author,Name)

    def write_to_mysql(self,title,url,img,author,mysql_name):
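        # Create a table named after the category (if needed) and insert the scraped rows.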
        j = 0
        db = pymysql.connect(host="localhost", user="root", password='123456', db="quanshu_mysql", charset="utf8")
        cursor = db.cursor()
        # Back-quote the table name (it is the Chinese category title) and use utf8 so titles store correctly.
        cursor.execute("create table if not exists `" + mysql_name + "`"
                       "(id int primary key auto_increment not null,"
                       "title varchar(50),url varchar(100) not null,"
                       "img varchar(100),author varchar(40)) default charset=utf8;")
        print(mysql_name, "table is ready!")
        while j < len(url):
            try:
                print("Writing record:", title[j])
                # Let pymysql escape the values instead of formatting them into the SQL string.
                cursor.execute("insert into `" + mysql_name + "`(title,url,img,author) values(%s,%s,%s,%s)",
                               (title[j], url[j], img[j], author[j]))
                db.commit()
                print(title[j], "written")
            except IndexError:
                print(mysql_name, "done writing!")
                break
            j += 1
        db.close()

    def Start_Spider(self):
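        # Orchestrator: fetch the category list, build one URL queue per category, then crawl with one thread per queue.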
        result = self.Get_All_Page_Number()
        if result is None:
            print("Could not fetch the category list, aborting.")
            return
        Page_Number, Name = result
        # Name holds the title of every category link.
        num = 1
        N = 0
        Queue_List = []
        Threads = []
        # Build one URL queue per category; category ids in the list URLs start at 1
        # and are assumed to follow the order of the navigation links.
        while num <= len(Page_Number):
            que = self.Struc_Ture_URL(num, Page_Number[N])
            Queue_List.append(que)
            num += 1
            N += 1
        Q_N = 0
        name_num = 0
        # One worker thread per queue, so each thread crawls a single category.
        while Q_N < len(Queue_List):
            t = threading.Thread(target=self.Request_Url, args=(Queue_List[Q_N], Name[name_num],), name="Spider0" + str(name_num))
            Threads.append(t)
            name_num += 1
            Q_N += 1

        for t in Threads:
            t.start()  # start every worker thread
        for t in Threads:
            t.join()

if __name__ == "__main__":
    URL = "http://www.quanshuwang.com/"
    C = QS_Spider(URL)
    C.Start_Spider()
    print("it's ok!")
 
