Python Threading Explained by Scraping Novels: Sharing a Multi-threaded Crawler for Biquge (笔趣阁) Novels


I couldn't make sense of the ones I found online, so this is one I wrote myself while learning and practicing.

import urllib.request as u
import re
import threading
import datetime
from queue import Queue

from bs4 import BeautifulSoup  # the 'lxml' parser used below also needs the lxml package installed

def url_open(url):
    """Fetch url with a browser User-Agent and return the decoded HTML."""
    try:
        req = u.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
        response = u.urlopen(req)  # pass the Request object, otherwise the User-Agent header is never sent
    except Exception as e:
        print('Error opening page:', e)
        raise  # without this, the code below would hit an unbound 'response'
    html = response.read()
    html = html.decode('utf-8')
    return html
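# Quick standalone check of url_open (uncomment to try it by itself; this is the
# same index URL used in __main__ below):
# print(url_open('https://www.xbiquge6.com/82_82692/')[:300])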

def zhangjiedizhi(html):  # takes the HTML returned by url_open(url)
    """Collect the absolute URL of every chapter link on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    x = []  # chapter URLs
    for ts in soup.find_all('a', href=re.compile(r'\.html')):
        x.append('https://www.xbiquge6.com' + str(ts.get('href')))
    x = x[12:]  # drop the first 12 links, which are not chapter links
    return x

def zhangjieming(html):
    """Same page, but collect the chapter titles instead."""
    soup = BeautifulSoup(html, 'lxml')
    x2 = []  # chapter titles
    for ts in soup.find_all('a', href=re.compile(r'\.html')):
        x2.append(ts.get_text())
    x2 = x2[12:]
    return x2
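# zhangjiedizhi and zhangjieming each parse the page separately (and thred() below
# fetches it twice). A single-pass variant that returns both lists at once; just a
# sketch, not used in the rest of the script:
def zhangjie_all(html):
    soup = BeautifulSoup(html, 'lxml')
    urls, names = [], []
    for ts in soup.find_all('a', href=re.compile(r'\.html')):
        urls.append('https://www.xbiquge6.com' + str(ts.get('href')))
        names.append(ts.get_text())
    return urls[12:], names[12:]  # same "skip the first 12 links" rule as above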

def gg(x, x2, q):  # x: this thread's chapter URLs, x2: matching titles, q: shared Queue (unused)
    f = open(str(x2[0]) + 'x.txt', 'a', encoding='utf-8')  # one file per thread, named after its first chapter
    for i in range(len(x)):
        try:
            req = u.Request(x[i])
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
            response = u.urlopen(req)  # again: the Request, not the bare URL
            html = response.read()

            html = html.decode('utf-8')

        except Exception as e:
            print('Error opening page:', e)
            continue  # skip this chapter rather than let the thread die on an unbound 'html'
        soup = BeautifulSoup(html, 'lxml')
        for div in soup.find_all('div', id='content'):  # renamed from 'e', which shadowed the exception variable
            f.write(str(x2[i]) + '\n' + str(div.get_text()) + '\n')
    f.close()  # the original 'f.close' (no parentheses) never actually closed the file
    print(x2[0], 'finished writing')
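# The q argument is never used in gg(); the commented-out block in thred() hints
# that results were meant to flow back through the Queue. A sketch of that design
# (an assumption about the intent, not the original behaviour): collect the text
# and q.put() it once, so the main thread can q.get() one result per worker after join().
def gg_queue(x, x2, q):
    parts = []
    for i in range(len(x)):
        soup = BeautifulSoup(url_open(x[i]), 'lxml')
        for div in soup.find_all('div', id='content'):
            parts.append(str(x2[i]) + '\n' + str(div.get_text()) + '\n')
    q.put(''.join(parts))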

def thred():
    q = Queue()
    threads = []
    x = zhangjiedizhi(url_open(url))   # chapter URLs ('url' is set in __main__ below)
    x2 = zhangjieming(url_open(url))   # chapter titles
    mm = len(x)
    # Hand each of 10 threads a contiguous tenth of the chapter list; consecutive
    # slices share their boundary index, so every chapter is covered exactly once.
    for i in range(10):
        t = threading.Thread(target=gg,
                             args=(x[int(0.1 * mm * i):int(0.1 * mm * (i + 1))],
                                   x2[int(0.1 * mm * i):int(0.1 * mm * (i + 1))],
                                   q))
        t.start()
        threads.append(t)
    print('all threads started')
    for each in threads:
        each.join()
    '''results=[]
    for m in range(5):
        results=results+(q.get())'''
    print('all threads finished')
    # Stitch the 10 per-thread files together, in chapter order.
    a = ''
    for i in range(10):
        f = open(str(x2[int(0.1 * i * mm)]) + 'x.txt', 'r', encoding='utf-8')
        a += str(f.read())
        f.close()  # parentheses added here too
    out = open('小说合集.txt', 'a', encoding='utf-8')  # 'out' instead of 'x', which shadowed the chapter list
    out.write(a)
    out.close()

if __name__ == "__main__":
    starttime = datetime.datetime.now()
    url = 'https://www.xbiquge6.com/82_82692/'  # index page of the novel to download
    thred()
    endtime = datetime.datetime.now()
    print(endtime - starttime)  # total running time
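For comparison, the manual thread bookkeeping in thred() can be replaced by the standard library's concurrent.futures. Below is a minimal sketch using the same chunking scheme and the same gg() worker; thred_pool is my own name, not part of the original script:

from concurrent.futures import ThreadPoolExecutor

def thred_pool(url, workers=10):
    x = zhangjiedizhi(url_open(url))
    x2 = zhangjieming(url_open(url))
    mm = len(x)
    q = Queue()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for i in range(workers):
            lo, hi = int(mm * i / workers), int(mm * (i + 1) / workers)
            pool.submit(gg, x[lo:hi], x2[lo:hi], q)
    # leaving the with-block waits for every task, like the join() loop in thred()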
