使用爬虫爬取晋江排行榜上文的基本数据
该网站的robots.txt:http://jjwxc.net/robots.txt
此内容显示可以爬取
视频版本可见B站:https://www.bilibili.com/video/BV1n7411F7w9/
纯原创
未经作者许可请勿转载
喜欢的话可以点个赞哟(^U^)ノ~YO
具体的代码:
(python)
import requests
from lxml import etree
import xlrd
import xlwt
from xlutils.copy import copy
import numpy as np
# Selects the pair of spreadsheet columns that export_excel writes to:
# labels go to column Time * 2 and counts to column Time * 2 + 1.
# NOTE(review): presumably bumped by hand between runs so each crawl lands in
# its own column pair -- confirm.
Time = 2
def get_url(page):
    """Build the ranking-list URL for one result page.

    The page number is converted to text and spliced between the fixed query
    prefix and the trailing sort parameter.

    Args:
        page: Page number to request (anything ``str()`` can render).

    Returns:
        The complete URL as a string.
    """
    query_prefix = 'http://www.jjwxc.net/bookbase.php?fw0=0&fbsj=0&ycx0=0&xx2=2&mainview0=0&sd0=0&lx0=0&fg0=0&isfinish=0&collectiontypes=ors&searchkeywords=&page='
    query_suffix = '&sortType=2'
    return ''.join((query_prefix, str(page), query_suffix))
def _download(url, headers, attempts=10, timeout=30):
    """GET *url*, retrying on request errors; re-raise the last error if all attempts fail."""
    last_error = None
    for _ in range(attempts):
        try:
            # A timeout is required for the retry loop to be useful: without
            # one a stalled connection blocks forever instead of failing.
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as err:
            last_error = err
    raise last_error


def _column_texts(selector, column):
    """Return the stripped text nodes of table column *column*, without the first entry."""
    cells = selector.xpath('/html/body/table/tbody/tr/td[%d]/text()' % column)
    # The first text node is skipped (presumably the header row -- confirm).
    # Slicing instead of ``del cells[0]`` keeps an empty result from raising.
    return [text.strip() for text in cells[1:]]


def get_info(url):
    """Download one ranking page and extract its type, style and word-count columns.

    Args:
        url: Address of the ranking page to fetch.

    Returns:
        A dict with the keys '类型' (type), '风格' (style) and '字数' (word
        count). Each value is a list of stripped cell texts taken from table
        columns 3, 4 and 6 respectively, with the first entry of each column
        dropped.

    Raises:
        requests.RequestException: The error from the final attempt, when all
            10 download attempts fail.
    """
    # Browser-like headers so the request looks like a normal visit.
    # NOTE(review): the Cookie value appears to carry a logged-in session
    # token; it should be moved out of source control into configuration.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'Host': 'jjwxc.net',
        'Cookie': 'UM_distinctid=16eaac573a120d-0f8ab6f28bc0a9-2393f61-144000-16eaac573a3120; __gads=ID=38dfefa1793ebbdd:T=1574823100:S=ALNI_MZJh-W_lqt5EH0dwp0P9dZqpPtUiA; __cfduid=d67904808eca374ffe03186aeef0a08621574826396; timeOffset_o=2480.199951171875; testcookie=yes; Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1582207503,1582269853,1582294742,1582343217; token=Mjk5MDY4Mjd8N2VmY2EyOWQ5Y2YzMWI5MWFmMzQ1YjVmNGI5Mjk3Y2N8fHx8MjU5MjAwMHwxfHx85qyi6L%2BO5oKo77yM5pmL5rGf55So5oi3fDF8bW9iaWxl; JJEVER=%7B%22fenzhan%22%3A%22noyq%22%2C%22sms_total%22%3A%220%22%2C%22ispayuser%22%3A%2229906827-1%22%2C%22foreverreader%22%3A%2229906827%22%7D; JJSESS=%7B%22sidkey%22%3A%22G7h6yLMTHXFEBZWbmqPrSlReuD9itsUYN52zjpkawC%22%2C%22nicknameAndsign%22%3A%222%257E%2529%2524%25E6%25B4%25BB%25E7%259D%2580%25E4%25B8%25BA%25E4%25BA%2586%25E4%25BB%2580%25E4%25B9%2588%22%7D; CNZZDATA30075907=cnzz_eid%3D88173465-1582114410-%26ntime%3D1582345015; Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1582345163',
    }
    # Retry so that a transient network failure does not abort the whole crawl.
    rsp = _download(url, headers, attempts=10)
    # Use the detected encoding so Chinese text is not garbled.
    rsp.encoding = rsp.apparent_encoding
    selector = etree.HTML(rsp.text)
    return {
        '类型': _column_texts(selector, 3),  # type
        '风格': _column_texts(selector, 4),  # style
        '字数': _column_texts(selector, 6),  # word count
    }
def export_excel(export, lastline):
    """Write one label/count table into the existing statistics workbook.

    The workbook is opened, copied into a writable object, updated on its
    first sheet and saved back to the same path. Keys of *export* are written
    to column ``Time * 2`` and their values to column ``Time * 2 + 1``, one
    entry per row.

    Args:
        export: Mapping of label -> count to write.
        lastline: Index of the first row to write to.
    """
    # Raw string: the path contains backslashes followed by characters that
    # are not valid escape sequences; the resulting value is unchanged.
    workbook_path = r"G:\www\web_crawl2\统计1[晋江][按收藏].xls"
    # Open the existing workbook and copy it so it can be written and saved.
    writable = copy(xlrd.open_workbook(workbook_path))
    sheet = writable.get_sheet(0)
    label_col = Time * 2
    # NOTE(review): word_list produces tuple keys; xlwt appears to treat a
    # tuple label as rich-text segments rather than a range -- confirm the
    # rendered cell text is what is wanted.
    for offset, (label, count) in enumerate(export.items()):
        row = lastline + offset
        sheet.write(row, label_col, label=label)
        sheet.write(row, label_col + 1, label=count)
    writable.save(workbook_path)
def all_list(arr):
    """Count how many times each distinct element occurs in *arr*.

    Args:
        arr: Iterable of hashable elements.

    Returns:
        A plain dict mapping each distinct element to its number of
        occurrences; empty input gives an empty dict.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # Single pass over the data instead of one full ``count`` scan per
    # distinct element.
    return dict(Counter(arr))
def word_list(arr):
    """Bucket word counts into power-of-two ranges and count each bucket.

    Buckets are (-inf, 2^15], (2^15, 2^16], ..., (2^22, 2^23] and a final
    bucket for everything above 2^23. Every bucket is always present in the
    result, even when its count is zero, and each element is counted once.

    Args:
        arr: List of integers. It is sorted in place, as before.

    Returns:
        A dict whose keys are ``(lower, upper)`` tuples of decimal strings
        for the bounded buckets (the first is ``('0', '32768')``) plus the
        string key '大于2^23' for the open-ended tail; values are counts.
    """
    # Local import keeps this block self-contained.
    from bisect import bisect_right

    arr.sort()
    result = {}
    lower = 0
    counted = 0  # number of elements already assigned to a bucket
    for exponent in range(15, 24):
        upper = 2 ** exponent
        # Index of the first element strictly greater than ``upper``.
        end = bisect_right(arr, upper)
        result[str(lower), str(upper)] = end - counted
        counted = end
        lower = upper
    # Whatever is left is strictly greater than 2^23.
    result['大于2^23'] = len(arr) - counted
    return result
def main():
    """Crawl ranking pages 1-49, tally the results and export them.

    For every page the type, style and word-count columns are appended to
    running lists. Types and styles are counted per distinct value and
    ordered by label descending; word counts are converted to int and
    bucketed. Each table is printed and then written to the workbook
    starting at rows 0, 80 and 86 respectively.
    """
    collected = {'类型': [], '风格': [], '字数': []}
    for page_no in range(1, 50):
        print(page_no)
        page_info = get_info(get_url(page_no))
        for field, values in collected.items():
            values.extend(page_info[field])

    tables = []
    # Categorical columns: count occurrences, then order labels descending.
    for field in ('类型', '风格'):
        counts = all_list(collected[field])
        ordered = {label: counts[label] for label in sorted(counts, reverse=True)}
        print(ordered)
        tables.append(ordered)

    # Word counts arrive as strings; convert before bucketing.
    word_counts = [int(text) for text in collected['字数']]
    buckets = word_list(word_counts)
    print(buckets)
    tables.append(buckets)

    # Write each table to the workbook at its fixed starting row.
    for table, first_row in zip(tables, (0, 80, 86)):
        export_excel(table, first_row)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()