A crawler exercise
My first crawler
This will be taken down right away if it infringes on anyone's rights
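Dependencies: requests, beautifulsoup4, and openpyxl (pip install requests beautifulsoup4 openpyxl).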
# -*- coding: utf-8 -*-
# @Time : 2022/4/4 17:19
# @File : 笔趣阁.py
# @Software: PyCharm
import os
import time
import openpyxl
import requests
from bs4 import BeautifulSoup
# import UserAgent
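# A minimal sketch of what such a helper could look like, assuming the
# third-party fake-useragent package (pip install fake-useragent); the
# function name mirrors the commented-out call in base_parse below, but the
# body is an assumption, not the original module:
# def get_headers():
#     from fake_useragent import UserAgent
#     return {"User-Agent": UserAgent().random}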
url_head = "https://m.jx.la"
home_url = "https://m.jx.la/sort/1/week/"
xiaoshuoming = []               # book titles
zuozhe_name = []                # author names
leixing = []                    # book genres
pianshu = []                    # chapter-index page URLs
fenlei_url_name = {}            # category name -> {"url": ...}
xiaoshuoname_url_zuozhe = {}    # book title -> {"url": ..., "作者和书名": ...}
# Shared parsing helper: fetch a page and return the first tag matching the
# given name/attribute filter, or the whole soup when no tag name is given.
def base_parse(url, lebla, classname, classattrs):
    # headers = UserAgent.get_headers()  # pick a random headers dict
    html = requests.get(url=url)  # pass headers=headers to use them
    soup = BeautifulSoup(html.text, "html.parser")
    if lebla is None:  # no filter: hand back the full soup
        return soup
    return soup.find(name=lebla, attrs={classname: classattrs})
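# Illustrative usage: base_parse(home_url, "section", "class", "nav") returns
# the first <section class="nav"> tag on the homepage (or None if absent),
# while base_parse(url, None, None, None) returns the full parsed soup.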
# Collect the category links from the site navigation.
def get_sort_url(url):
    # locate the tag that holds the links
    biaoqian_name = 'section'
    class_name = 'class'
    class_attrs = 'nav'
    soup = base_parse(url, biaoqian_name, class_name, class_attrs)
    # extract the links
    biaoqian_name_2 = "a"
    href_name = "href"
    soup2 = soup.find_all(name=biaoqian_name_2)
    for i in soup2:
        fenlei_name = i.string
        fenlei_url = url_head + str(i.get(href_name))
        fenlei_url_name[fenlei_name] = {
            "url": fenlei_url
        }
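# After get_sort_url runs, fenlei_url_name maps each category name to its
# URL; illustrative shape (the path value is an assumption about the site):
# {"武侠仙侠": {"url": "https://m.jx.la/sort/2/"}, ...}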
# Collect the book links inside the chosen category.
def jutixiaoshuo(input_fenlei):
    lei_url = fenlei_url_name[input_fenlei]["url"]
    biaoqian_name = 'ul'
    class_name = "class"
    class_attrs = "list-1"
    soup = base_parse(lei_url, biaoqian_name, class_name, class_attrs)
    guolv = soup.find_all(name="a", attrs={"class": "info"})
    for i in guolv:
        xiaoshuo_url = url_head + i.get("href")
        name_get = i.find("h3").string   # book title
        zuozhe = i.find("p").string      # author name
        book_author = name_get + "--作者:" + zuozhe
        xiaoshuoname_url_zuozhe[name_get] = {
            'url': xiaoshuo_url,
            "作者和书名": book_author
        }
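# Illustrative shape of xiaoshuoname_url_zuozhe after one category is parsed
# (title, author, and URL values are placeholders):
# {"某书名": {"url": "https://m.jx.la/book/12345/", "作者和书名": "某书名--作者:某人"}}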
# Open (or create) the Excel file for the chosen book, scrape its metadata,
# then walk every page of the chapter index.
def get_book_num(input_book):
    download_url = xiaoshuoname_url_zuozhe[input_book]["url"]
    xiaoshuoming.append(input_book)
    path = f"{input_book}.xlsx"
    if not os.path.exists(path):  # create the workbook only if it is missing
        wbook = openpyxl.Workbook()  # new Excel workbook object
        wbook.save(path)  # save an empty file
        print("File created!")
    else:
        print("File already exists!")
    # parse the book's detail page
    xiaoshuo_data_lebla = "section"
    class_data = "class"
    classattrs_data = "g-detail"
    soup = base_parse(download_url, xiaoshuo_data_lebla, class_data, classattrs_data)
    # pull the author and genre info
    get_xiaoshuo_data = soup.find_all("div", {"class": "info"})
    for i in get_xiaoshuo_data:
        xiaoshuo_type = i.find("p", {"class": "type"}).text
        leixing.append(xiaoshuo_type)
        author = i.find("p", {"class": "author"}).text
        zuozhe_name.append("作者:" + author)
    biaoqianname = "div"
    idname = "id"
    classattrs = "main"
    # the block that holds the chapter-index dropdown
    get_zhangshu = soup.find(biaoqianname, {idname: classattrs})
    # each <option> is one page of the chapter index
    soup_zhangshu = get_zhangshu.find_all("option")
    for i1 in soup_zhangshu:
        booknum_url = url_head + i1.get("value")
        pianshu.append(booknum_url)
        get_content_url(booknum_url)
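# Illustrative markup this relies on (an assumption about the page layout):
# <div id="main"><select><option value="/book/12345/1/">第1-100章</option>...</select></div>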
# Interactive prompt: pick a category, then a book.
def input_():
    print("{------------biquge------}")
    for i in fenlei_url_name.keys():
        print(i, end=" ")
    print()
    input_fenlei = input("Enter a category: ") or "武侠仙侠"
    if input_fenlei not in fenlei_url_name:
        print("Category not found, please try again!")
        print("----------------------")
        input_()
    else:
        jutixiaoshuo(input_fenlei)
        for i1 in xiaoshuoname_url_zuozhe.values():
            print(i1["作者和书名"])
        input_book = input("Enter a book title: ") or "武道24小时自动苦练我直接开挂"
        if input_book not in xiaoshuoname_url_zuozhe:
            print("Book not found, please try again!")
            print("----------------------")
            input_()
        else:
            get_book_num(input_book)
# Walk one page of the chapter index and scrape every chapter on it.
def get_content_url(booknum_url):
    # locate the block that holds the chapter links
    content_lebla = "div"
    idname = "id"
    neirongclassattrs = "main"
    soup_href = base_parse(booknum_url, content_lebla, idname, neirongclassattrs)
    # collect the link of every chapter
    soup_href = soup_href.find(name="ul", attrs={"class": "list"}).find_all("a")
    for i in soup_href:
        xiaoshuo_neirong_url = url_head + i.get("href")
        get_book_content(xiaoshuo_neirong_url)
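# Illustrative markup this walks (an assumption about the page layout):
# <ul class="list"><li><a href="/book/12345/1.html">第一章</a></li>...</ul>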
# Scrape a single chapter page and append it to the Excel sheet.
def get_book_content(xiaoshuo_neirong_url):
    soup = base_parse(xiaoshuo_neirong_url, None, None, None)
    # chapter title
    title_labla = "h2"
    class_name = "class"
    titile_attrs = "chapter-name"
    title_name = soup.find(title_labla, {class_name: titile_attrs})
    title_name_text = title_name.text
    # first page of the chapter body
    textbiaoqian = "div"
    textid = "id"
    textattrs = "content"
    text1 = soup.find(textbiaoqian, {textid: textattrs})
    text1 = text1.text
    # check whether the chapter continues on another page
    next_lebla = "section"
    next_class_attrs = "g-content-nav"
    next_class = "class"
    assert_nextpage = soup.find(next_lebla, {next_class: next_class_attrs}).find_all("a")
    texts2 = ""  # empty, so the concatenation below also works without a second page
    for i in assert_nextpage:
        # the link text must match the site's own wording ("next page"/"next chapter")
        if i.text == "下一页" or i.text == "下一章":
            next_url = url_head + i.get("href")
            # fetch the body of the follow-up page
            soup = base_parse(next_url, "div", "id", "content")
            texts2 = soup.text
    xiaoshuotext = text1 + texts2
    # the metadata lists gain one entry per scraped book; use the latest
    xiaoshuoname_ = xiaoshuoming[-1]
    zuozhename_ = zuozhe_name[-1]
    leixingname_ = leixing[-1]
    # open the workbook and append one row per chapter:
    # [book title, author, genre, chapter title, chapter text]
    path = f"{xiaoshuoname_}.xlsx"
    wb = openpyxl.load_workbook(path)
    sheet1 = wb['Sheet']  # openpyxl's default sheet name
    sheet1.append([xiaoshuoname_, zuozhename_, leixingname_, title_name_text, xiaoshuotext])
    # saving after every chapter lets you stop the script at any point
    wb.save(path)
    print("[" + title_name_text + "] saved")
    time.sleep(5)  # throttle requests to be polite to the server
    # alternative: save only once after the whole book is scraped
    # (mutually exclusive with the per-chapter save above)
    # wb.save(f'{xiaoshuoname_}.xlsx')
def main():
    get_sort_url(home_url)
    input_()

if __name__ == '__main__':
    main()