爬虫起点小说网所有小说基本信息
第一篇博客,先试试水。爬虫你们懂的,三小时5万条数据:
- 多线程
- 失败再爬取机制
- 多次失败链接储存再爬取
- 自定义数据量
代码块
导入需要的包
# -*- coding: utf-8 -*-
import csv
import datetime
import json
import time
from urllib.parse import urlencode

import requests
import threadpool
from bs4 import BeautifulSoup
将每本小说url储存到urls.txt:
def load(i, count=0):
    """Collect the 20 book urls on listing page *i* and append them to urls.txt.

    Retries up to 5 times on any failure; after that the page url and
    page number are logged to urllist.txt for a later re-crawl.

    :param i: 1-based page number of the qidian "all books" listing.
    :param count: current retry count (internal, used by the recursion).
    """
    # Build the page url once, outside the loop, so the except branch logs
    # the listing-page url (the original rebound `url` to a book link inside
    # the loop, so the wrong url was written on failure).
    page_url = "https://www.qidian.com/all?page=" + str(i)
    try:
        print("正在采集页面:{}".format(page_url))
        page = requests.get(page_url)
        page.encoding = "utf-8"
        soup = BeautifulSoup(page.text, 'lxml')
        elems = soup.select(".book-mid-info h4 a")  # book title links
        urls = ['https:' + elem.get('href') for elem in elems[:20]]
        # Every listing page is expected to hold exactly 20 books;
        # anything else means a bad/partial page, so retry.
        if len(urls) != 20:
            raise ValueError("page {} yielded {} urls".format(i, len(urls)))
        with open('urls.txt', 'a', encoding='utf-8') as f:  # append results
            for cont in urls:
                f.write(cont + '\n')
    except Exception as e:
        # NOTE: Exception, not BaseException — catching BaseException would
        # also swallow KeyboardInterrupt and break the manual-pause feature.
        if count < 5:
            load(i, count + 1)
        else:
            print(str(e))
            with open('urllist.txt', 'a', encoding='utf-8') as fp:
                # str(i): the original concatenated str + int -> TypeError.
                fp.write(page_url + ' ' + str(i) + '\n')
def loadurl(start, end, thrednum):
    """Crawl listing pages start..end (inclusive) concurrently via load().

    :param start: first page number.
    :param end: last page number (inclusive).
    :param thrednum: size of the thread pool.
    """
    links = list(range(start, end + 1))  # page numbers to crawl
    print(len(links))
    try:
        pool = threadpool.ThreadPool(thrednum)  # worker pool
        # Renamed from `requests`: the original shadowed the `requests`
        # HTTP module inside this function.
        work_items = threadpool.makeRequests(load, links)
        for req in work_items:
            pool.putRequest(req)
        pool.wait()
    except KeyboardInterrupt:
        print('手动暂停')
初始化qidian.csv文件,仅能执行一次:
def init():
    """Create qidian.csv and write its header row.

    Run only once per crawl — "w" mode truncates any existing file.
    """
    header = ['book_name', 'author', 'words_count', 'click_count',
              'books_count', 'score', 'j_user_count', 'crawl_time', 'id']
    # Chinese meanings: 小说名, 作者, 字数, 点击量, 作品个数, 评分, 评价人数, 抓取时间, url
    # encoding="utf-8" added for consistency with the appends done in work().
    with open("qidian.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(header)
读取urls.txt文件,将小说转换成记录储存到qidian.csv。:
def work(url, count=0):
    """Scrape one book's detail page and append a record to qidian.csv.

    Successfully processed urls are logged to doneurl.txt; after 5 failed
    attempts the url is logged to error_url.txt instead.

    :param url: book page url, e.g. https://book.qidian.com/info/<id>
    :param count: current retry count (internal, used by the recursion).
    """
    try:
        # Moved inside the try block so network errors are retried too
        # (the original let them propagate out of the function).
        page = requests.get(url)
        page.encoding = "utf-8"
        soup = BeautifulSoup(page.text, 'lxml')
        # Basic fields scraped from the page itself.
        book_name = soup.select(".book-info h1 em")[0].text
        author = soup.select(".writer")[0].text
        words_count = soup.select(".book-info p em")[0].text
        click_count = soup.select(".book-info p em")[1].text
        books_count = soup.select(".work-state li em")[0].text
        # `book_id` instead of `id` — avoid shadowing the builtin.
        book_id = url.replace("https://book.qidian.com/info/", "")
        crawl_time = get_unix_time()
        print(url)
        # Score and rating count come from the comment ajax endpoint.
        data = {
            '_csrfToken': 'QpbsVhyc5zc0h21NiEweIrLMu2tFOM1RsgfZtWSS',
            'bookId': book_id,
            'pageSize': 15
        }
        other_url = 'https://book.qidian.com/ajax/comment/index?' + urlencode(data)
        resp = requests.get(other_url, stream=True)
        resp.encoding = "utf-8"
        # json.loads replaces eval(): never eval text fetched from the
        # network, and eval breaks on JSON true/false/null anyway.
        cont = json.loads(resp.text)
        score = cont.get('data').get('rate')
        j_user_count = cont.get('data').get('userCount')
        # Append the record; column order matches the header written by init().
        row = [book_name, author, words_count, click_count, books_count,
               score, j_user_count, crawl_time, book_id]
        with open("qidian.csv", "a", encoding="utf-8", newline='') as f:
            csv.writer(f, dialect="excel").writerow(row)
        with open("doneurl.txt", "a", newline='', encoding='utf-8') as fe:
            fe.write(url + '\n')
    except Exception:
        # Exception, not BaseException: keep KeyboardInterrupt working.
        if count < 5:
            print('error 元素获取失败 重试次数:' + str(count))
            time.sleep(2)  # back off briefly before retrying
            work(url, count + 1)
        else:
            with open("error_url.txt", "a", encoding='utf-8') as fe:
                fe.write(url + '\n')
            print('error 元素获取失败 写入文件')
其他函数及爬虫启动函数
#时间戳
def get_unix_time():
    """Return the current time as an integer unix timestamp (seconds)."""
    now = datetime.datetime.now()
    return int(time.mktime(now.timetuple()))
#爬虫启动
def spider(start=1, end=2500, thrednum=10):
    """Run the full crawl: collect book urls for listing pages start..end,
    then scrape every book into qidian.csv using a thread pool.

    :param start: first listing page.
    :param end: last listing page (inclusive).
    :param thrednum: thread-pool size for both phases.
    """
    # Phase 1: collect every book url into urls.txt.
    loadurl(start, end, thrednum)
    # Phase 2: read the urls back; "r" instead of "r+" — we only read here.
    with open('urls.txt', 'r', encoding='utf-8') as f:
        links = [line.strip('\n') for line in f if line.strip('\n')]
    # Create qidian.csv with its header row.
    init()
    try:
        pool = threadpool.ThreadPool(thrednum)  # worker pool
        # Renamed from `requests`, which shadowed the HTTP module.
        work_items = threadpool.makeRequests(work, links)
        for req in work_items:
            pool.putRequest(req)
        pool.wait()
    except KeyboardInterrupt:
        print('手动暂停')
爬虫启动
# Launch the crawl: listing pages 1-2500, 20 worker threads.
spider(1,2500,20)
从第1页爬取到第2500页,20条线程,每页20本书,一共约5万条记录
作者想说的话
- 本人第一次发博客,必有很多解释不到之处,还请大家多多指教。
- 欢迎任何人发现本人代码中存在的问题或者可以改进的地方与本人交流,必将感激不尽
- 本人qq:289672494 常用
- 希望大家共同进步
- 需要获取其他的元素改部分即可
注:本人必将遵守法律法规,不发生任何盗取网站数据或影响网站运营的行为,此篇文章仅供广大博友或来访者参考。倡导绿色安全网络环境,人人有责。