Python Threading Explained by Scraping Novels: Sharing a Multi-threaded Crawler for Biquge (笔趣阁) Novels


I couldn't make sense of the ones I found online, so this is one I wrote myself while learning and practicing.

import urllib.request as u
import re
import threading
import datetime
from queue import Queue

from bs4 import BeautifulSoup  # the 'lxml' parser used below also needs the lxml package installed

def url_open(url):
    """Fetch url with a browser User-Agent and return the decoded HTML."""
    try:
        req = u.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
        response = u.urlopen(req)  # pass the Request object, otherwise the User-Agent header is never sent
    except Exception as e:
        print('Error opening page:', e)
        raise  # without this, the code below would hit an unbound 'response'
    html = response.read()
    html = html.decode('utf-8')
    return html
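# Quick standalone check of url_open (uncomment to try it by itself; this is the
# same index URL used in __main__ below):
# print(url_open('https://www.xbiquge6.com/82_82692/')[:300])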

def zhangjiedizhi(html):  # takes the HTML returned by url_open(url)
    """Collect the absolute URL of every chapter link on the index page."""
    soup = BeautifulSoup(html, 'lxml')
    x = []  # chapter URLs
    for ts in soup.find_all('a', href=re.compile(r'\.html')):
        x.append('https://www.xbiquge6.com' + str(ts.get('href')))
    x = x[12:]  # drop the first 12 links, which are not chapter links
    return x

def zhangjieming(html):
    """Same page, but collect the chapter titles instead."""
    soup = BeautifulSoup(html, 'lxml')
    x2 = []  # chapter titles
    for ts in soup.find_all('a', href=re.compile(r'\.html')):
        x2.append(ts.get_text())
    x2 = x2[12:]
    return x2
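# zhangjiedizhi and zhangjieming each parse the page separately (and thred() below
# fetches it twice). A single-pass variant that returns both lists at once; just a
# sketch, not used in the rest of the script:
def zhangjie_all(html):
    soup = BeautifulSoup(html, 'lxml')
    urls, names = [], []
    for ts in soup.find_all('a', href=re.compile(r'\.html')):
        urls.append('https://www.xbiquge6.com' + str(ts.get('href')))
        names.append(ts.get_text())
    return urls[12:], names[12:]  # same "skip the first 12 links" rule as above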

def gg(x, x2, q):  # x: this thread's chapter URLs, x2: matching titles, q: shared Queue (unused)
    f = open(str(x2[0]) + 'x.txt', 'a', encoding='utf-8')  # one file per thread, named after its first chapter
    for i in range(len(x)):
        try:
            req = u.Request(x[i])
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
            response = u.urlopen(req)  # again: the Request, not the bare URL
            html = response.read()

            html = html.decode('utf-8')

        except Exception as e:
            print('Error opening page:', e)
            continue  # skip this chapter rather than let the thread die on an unbound 'html'
        soup = BeautifulSoup(html, 'lxml')
        for div in soup.find_all('div', id='content'):  # renamed from 'e', which shadowed the exception variable
            f.write(str(x2[i]) + '\n' + str(div.get_text()) + '\n')
    f.close()  # the original 'f.close' (no parentheses) never actually closed the file
    print(x2[0], 'finished writing')
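# The q argument is never used in gg(); the commented-out block in thred() hints
# that results were meant to flow back through the Queue. A sketch of that design
# (an assumption about the intent, not the original behaviour): collect the text
# and q.put() it once, so the main thread can q.get() one result per worker after join().
def gg_queue(x, x2, q):
    parts = []
    for i in range(len(x)):
        soup = BeautifulSoup(url_open(x[i]), 'lxml')
        for div in soup.find_all('div', id='content'):
            parts.append(str(x2[i]) + '\n' + str(div.get_text()) + '\n')
    q.put(''.join(parts))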

def thred():
    q = Queue()
    threads = []
    x = zhangjiedizhi(url_open(url))   # chapter URLs ('url' is set in __main__ below)
    x2 = zhangjieming(url_open(url))   # chapter titles
    mm = len(x)
    # Hand each of 10 threads a contiguous tenth of the chapter list; consecutive
    # slices share their boundary index, so every chapter is covered exactly once.
    for i in range(10):
        t = threading.Thread(target=gg,
                             args=(x[int(0.1 * mm * i):int(0.1 * mm * (i + 1))],
                                   x2[int(0.1 * mm * i):int(0.1 * mm * (i + 1))],
                                   q))
        t.start()
        threads.append(t)
    print('all threads started')
    for each in threads:
        each.join()
    '''results=[]
    for m in range(5):
        results=results+(q.get())'''
    print('all threads finished')
    # Stitch the 10 per-thread files together, in chapter order.
    a = ''
    for i in range(10):
        f = open(str(x2[int(0.1 * i * mm)]) + 'x.txt', 'r', encoding='utf-8')
        a += str(f.read())
        f.close()  # parentheses added here too
    out = open('小说合集.txt', 'a', encoding='utf-8')  # 'out' instead of 'x', which shadowed the chapter list
    out.write(a)
    out.close()

if __name__ == "__main__":
    starttime = datetime.datetime.now()
    url = 'https://www.xbiquge6.com/82_82692/'  # index page of the novel to download
    thred()
    endtime = datetime.datetime.now()
    print(endtime - starttime)  # total running time
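For comparison, the manual thread bookkeeping in thred() can be replaced by the standard library's concurrent.futures. Below is a minimal sketch using the same chunking scheme and the same gg() worker; thred_pool is my own name, not part of the original script:

from concurrent.futures import ThreadPoolExecutor

def thred_pool(url, workers=10):
    x = zhangjiedizhi(url_open(url))
    x2 = zhangjieming(url_open(url))
    mm = len(x)
    q = Queue()
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for i in range(workers):
            lo, hi = int(mm * i / workers), int(mm * (i + 1) / workers)
            pool.submit(gg, x[lo:hi], x2[lo:hi], q)
    # leaving the with-block waits for every task, like the join() loop in thred()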
