# 笔趣阁 (biquge, www.853.la) novel scraper — Python implementation.

# Gevent monkey-patching must run as early as possible — before any module
# that creates sockets (requests / urllib) is imported — otherwise the
# blocking primitives those modules already bound are left unpatched.
from gevent import monkey

monkey.patch_all(select=False)

import os
import random
import re
import time
from urllib import parse

import gevent
import requests
from bs4 import BeautifulSoup
from lxml import etree

# Proxy pool; one entry is picked at random per request.
# NOTE: `requests` looks proxies up by the (lowercase) URL scheme, so the
# dict keys must be lowercase 'https' — the original uppercase 'HTTPS' keys
# were silently ignored and no proxy was ever used.
IPs = [
    {'https': 'https://182.114.221.180:61202'},
    {'https': 'https://60.162.73.45:61202'},
    {'https': 'https://113.13.36.227:61202'},
    {'https': 'https://1.197.88.101:61202'},
]

# Browser-like headers (UA, Accept, language, site cookie) to pass trivial
# anti-bot checks.  The Accept value restores the '*/*' the blog paste ate.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'UM_distinctid=1638b54c8f3279-0003db1d70474a-39614807-384000-1638b54c8f4843; CNZZDATA1261736110=613318700-1527048008-null%7C1530014624; Hm_lvt_5ee23c2731c7127c7ad800272fdd85ba=1530014621,1530014629,1530014706,1530015295; bookid=34778; bcolor=; font=; size=; fontcolor=; width=; chapterid=1896093; chaptername=%25u7B2C1%25u7AE0%2520%25u65B0%25u4E16%25u754C%25u548C%25u65B0%25u8EAB%25u4EFD; Hm_lpvt_5ee23c2731c7127c7ad800272fdd85ba=1530016490',
}

def setDir():
    """Ensure the ./Noval output directory exists (create it if missing)."""
    # makedirs(exist_ok=True) is race-free, unlike the original
    # listdir-then-mkdir check.
    os.makedirs('./Noval', exist_ok=True)

def getNoval(url, id, data, faillist):
    """Fetch one chapter page and store '<title>\\n<body>' in data[id].

    On any network/parse failure the chapter index is appended to
    `faillist` so the caller can retry it later (see getNoval2).

    :param url: absolute chapter URL
    :param id: chapter index, used as the key into `data`
    :param data: shared dict mapping chapter index -> chapter text
    :param faillist: shared list collecting indices of failed fetches
    """
    try:
        res = requests.get(url, headers=HEADERS,
                           proxies=random.choice(IPs), timeout=5)
        # Use the encoding requests sniffs from the body, then normalise
        # non-breaking spaces (&nbsp;, U+00A0) to plain spaces.
        res.encoding = res.apparent_encoding
        html = res.text.replace('\xa0', ' ')
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find('div', attrs={'id': 'content'})
        name = soup.find('div', attrs={'class': 'bookname'}).h1.text
        if name:
            data[id] = name + '\n' + content.text
    except Exception:
        # Proxy/timeout/parse errors are expected occasionally; record the
        # index for the retry pass instead of aborting the whole crawl.
        faillist.append(id)

def getNoval2(url, id, data):
    """Fetch one chapter like getNoval, but retry until it succeeds.

    Used for chapters that failed the first pass.  Loops forever on
    repeated failure, so the URL must eventually be reachable.

    :param url: absolute chapter URL
    :param id: chapter index, used as the key into `data`
    :param data: shared dict mapping chapter index -> chapter text
    """
    while True:
        try:
            # timeout added: the original had none, so one stuck connection
            # could hang this retry loop forever.
            res = requests.get(url, headers=HEADERS,
                               proxies=random.choice(IPs), timeout=10)
            res.encoding = res.apparent_encoding
            # Normalise non-breaking spaces (&nbsp;, U+00A0) to plain spaces.
            html = res.text.replace('\xa0', ' ')
            soup = BeautifulSoup(html, 'lxml')
            content = soup.find('div', attrs={'id': 'content'})
            name = soup.find('div', attrs={'class': 'bookname'}).h1
            if name:
                data[id] = name.text + '\n' + content.text
        except Exception:
            continue
        else:
            break

def getContentFile2(url):
    """Fetch a book's index page and return (chapter_urls, book_name).

    :param url: book index page, e.g. 'http://www.853.la/shu1025/'
    :return: tuple of (list of absolute chapter URLs in page order,
             book title string)
    """
    res = requests.get(url, headers=HEADERS, proxies=random.choice(IPs))
    res.encoding = res.apparent_encoding
    soup = BeautifulSoup(res.text, 'lxml')
    bookname = soup.find('div', attrs={'id': 'info'}).h1.text
    datalist = soup.find('div', attrs={'id': 'list'})
    # Raw string + escaped dot: the original '/book.*?.html' let the
    # unescaped '.' match any character before 'html'.
    links = datalist.find_all(attrs={'href': re.compile(r'/book.*?\.html')})
    data = ['https://www.853.la' + a['href'] for a in links]
    return data, bookname

def BuildGevent(baseurl):
    """Crawl every chapter of the book at `baseurl` into ./Noval/<name>.txt.

    Chapters are fetched in batches of `steps` concurrent greenlets;
    chapters that fail the first pass are retried (until success) in a
    second pass, then all chapters are written to disk in chapter order.

    :param baseurl: the book's index page URL
    """
    content, bookname = getContentFile2(baseurl)
    steps = 2  # greenlets per batch
    length = len(content)
    name = "%s.txt" % bookname
    data = {}       # chapter index -> chapter text (filled by the workers)
    faillist = []   # chapter indices that failed the first pass

    # First pass: fetch chapters in batches of `steps`.
    # (Condition fixed from '(count - 1) * steps < length', which ran one
    # extra, empty iteration.)
    count = 0
    while count * steps < length:
        batch = [gevent.spawn(getNoval, content[i + count * steps],
                              i + count * steps, data, faillist)
                 for i in range(steps) if i + count * steps < length]
        gevent.joinall(batch)
        print(count)
        count += 1

    print("HE")

    # Second pass: retry the failed chapters until they all succeed.
    # BUG FIX: the original never incremented `count` in this loop, so it
    # spun forever whenever `faillist` was non-empty.
    count = 0
    faillistlen = len(faillist)
    while count * steps < faillistlen:
        batch = [gevent.spawn(getNoval2, content[faillist[i + count * steps]],
                              faillist[i + count * steps], data)
                 for i in range(steps) if i + count * steps < faillistlen]
        gevent.joinall(batch)
        count += 1

    # Join in chapter-index order: dict insertion order here is completion
    # order (retried chapters land last), not reading order.
    String = '\n'.join(data[k] for k in sorted(data))
    with open('./Noval/' + name, 'w', encoding='gb18030', errors='ignore') as ff:
        ff.write(String)

if __name__ == '__main__':
    # Time the full crawl of one book.  (The paste had dropped the dunder
    # underscores: 'if name == main' never ran.)
    starttime = time.time()
    setDir()
    url = 'http://www.853.la/shu1025/'
    BuildGevent(url)
    endtime = time.time()
    print("Total use time: %.6f" % (endtime - starttime))
# NOTE(review): trailing blog-platform boilerplate (CSDN comment/"red packet"
# payment-widget text) removed — it was scraping residue, not part of the code.