#!/usr/bin/env python
# -*- coding:utf-8 -*-
#@author:Chris iven
#Python version 3.6
from lxml import etree
import requests,random,re
from requests.exceptions import RequestException,ConnectionError,ReadTimeout
from fake_useragent import UserAgent
import queue,threading,pymysql
class QS_Spider(object):
    """Multi-threaded crawler for www.quanshuwang.com.

    Discovers the site's category navigation links, walks every listing page
    of each category, extracts book metadata (title, url, cover image, author)
    and stores one MySQL table per category.
    """

    # base pattern for category listing pages: /list/<category>_<page>.html
    LIST_URL = "http://www.quanshuwang.com/list/"

    def __init__(self, url):
        # url: entry page of the site (the category navigation lives here)
        self.url = url

    def _build_headers(self):
        """Return request headers with a freshly randomised User-Agent."""
        ua = UserAgent()
        return {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": ua.random,
            "Referer": "http://www.quanshuwang.com/",
        }

    def Get_All_Page_Number(self):
        """Fetch the home page and, for each category link, its max page count.

        Returns:
            (page_counts, category_names) — parallel lists, or None on any
            network error or non-200 response (callers must check for None).
        """
        headers = self._build_headers()
        print("当前的UA是:", headers["User-Agent"])
        try:
            response = requests.get(self.url, headers=headers, timeout=10)
        except (ConnectionError, ReadTimeout, RequestException):
            # ConnectionError/ReadTimeout are RequestException subclasses,
            # but listing them keeps the original intent explicit.
            return None
        if response.status_code != 200:
            return None
        print("访问成功!")
        response.encoding = "gbk"  # the site serves GBK-encoded pages
        html = etree.HTML(response.text)
        all_links = html.xpath('//ul[@class="channel-nav-list"]/li/a/@href')
        all_names = html.xpath('//ul[@class="channel-nav-list"]/li/a/text()')
        page_counts = []  # max page number per category, same order as names
        for link in all_links:
            try:
                child = requests.get(link, headers=headers, timeout=10)
            except (ConnectionError, RequestException):
                return None
            child.encoding = "gbk"
            child_html = etree.HTML(child.text)
            last = child_html.xpath('//a[@class="last"]/text()')
            # BUGFIX: a category page without a pager used to raise IndexError;
            # treat it as a single-page category instead.
            page_counts.append(int(last[0]) if last else 1)
        return page_counts, all_names

    def Struc_Ture_URL(self, Number, param):
        """Build a queue of listing-page URLs for one category.

        Args:
            Number: category id as used in the site's /list/ URLs.
            param: number of pages in that category.
        Returns:
            queue.Queue of ".../list/<Number>_<page>.html" strings,
            pages 1..param inclusive.
        """
        url_queue = queue.Queue()
        for page in range(1, param + 1):
            url = "{}{}_{}.html".format(self.LIST_URL, Number, page)
            print("正在放入:", url)
            url_queue.put_nowait(url)
        return url_queue

    def Request_Url(self, q, Name):
        """Worker loop: drain queue q, fetch each page, parse into table Name."""
        while True:
            try:
                url = q.get_nowait()
            except queue.Empty:
                break  # queue drained — this worker is done
            print("线程名称:%s, 链接:%s" % (threading.current_thread().name, url))
            headers = self._build_headers()
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except ConnectionError:
                print("连接错误!")
                # BUGFIX: was `return None`, which abandoned every remaining
                # URL in the queue after one bad request.
                continue
            except RequestException:
                print("错误!")
                continue
            response.encoding = "gbk"
            if response.status_code == 200:
                print("访问成功!")
                self.Parsing_Html(response.text, Name)

    def Parsing_Html(self, response, Name):
        """Extract book url/img/title/author lists from listing-page HTML."""
        html = etree.HTML(response)
        url = html.xpath('//a[@class="l mr10"]/@href')
        img = html.xpath('//a[@class="l mr10"]/img/@src')
        title = html.xpath('//span[@class="l"]/a/@title')
        # the author name is not easily addressable via xpath here; the
        # original used a regex over the raw HTML — keep that approach.
        author = re.findall('<span class="l".*?<a href=.*?>(.*?)</a><em.*?', response, re.S)
        print("url:", url, "\n")
        print("img:", img, "\n")
        print("title:", title, "\n")
        self.write_to_mysql(title, url, img, author, Name)

    def write_to_mysql(self, title, url, img, author, mysql_name):
        """Create table mysql_name (if missing) and insert one row per book.

        The four metadata lists are zipped, so insertion stops at the shortest
        list — same effect as the original's IndexError-break loop.
        """
        db = pymysql.connect(host="localhost", user="root", password='123456',
                             db="quanshu_mysql", charset="utf8")
        try:
            cursor = db.cursor()
            # NOTE: a table name cannot be bound as a SQL parameter;
            # mysql_name comes from the site's own category list, not user
            # input. IF NOT EXISTS replaces the old bare `except: pass`.
            cursor.execute(
                "create table if not exists " + mysql_name +
                "(id int primary key auto_increment not null,"
                "title varchar(50),url varchar(100) not null,"
                "img varchar(100),author varchar(40));")
            print(mysql_name, "数据库创建成功!")
            for row_title, row_url, row_img, row_author in zip(title, url, img, author):
                print("正在写入数据:", row_title)
                # BUGFIX: values are now bound parameters; the original
                # interpolated raw strings, which broke on any quote in a
                # title and was SQL-injection-prone.
                cursor.execute(
                    'insert into ' + mysql_name +
                    '(title,url,img,author) values(%s,%s,%s,%s)',
                    (row_title, row_url, row_img, row_author))
                db.commit()
                print(row_title, "写入成功")
            print(mysql_name, "里面的数据写入成功!")
        finally:
            db.close()  # BUGFIX: the original leaked the connection

    def Start_Spider(self):
        """Entry point: one URL queue + one worker thread per category."""
        result = self.Get_All_Page_Number()
        if result is None:
            # BUGFIX: the original unpacked None and crashed with TypeError
            # whenever the home-page request failed.
            print("首页访问失败!")
            return
        Page_Number, Name = result
        # the site exposes 12 categories with ids 1..12; cap the slice in
        # case fewer links were discovered, instead of indexing past the end.
        queues = [self.Struc_Ture_URL(cat_id, pages)
                  for cat_id, pages in enumerate(Page_Number[:12], start=1)]
        threads = []
        for idx, que in enumerate(queues):
            t = threading.Thread(target=self.Request_Url,
                                 args=(que, Name[idx],),
                                 name="Spider0" + str(idx))
            threads.append(t)
        for t in threads:
            t.start()  # start all workers
        for t in threads:
            t.join()   # wait for every queue to drain
if __name__ == "__main__":
URL = "http://www.quanshuwang.com/"
C = QS_Spider(URL)
C.Start_Spider()
print("it's ok!")