import requests
from lxml import etree
# Browser-like User-Agent sent with every request — presumably to avoid
# qidian.com rejecting the default python-requests client (NOTE(review):
# assumption; confirm the site actually blocks bare clients).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
def parse_url(url):
    """Fetch a book's detail page and extract its name, author, and intro.

    Parameters
    ----------
    url : str
        Absolute URL of a book detail page on qidian.com.

    Returns
    -------
    dict
        Keys: "book_name", "book_atuhor" (sic — misspelling kept because it
        is part of the returned interface), "book_info", "book_url".
        Name/author are "" when the expected markup is absent.
    """
    response = requests.get(url=url, headers=HEADERS)
    html = etree.HTML(response.text)

    # xpath() returns a (possibly empty) list; guard the [0] access so a
    # page-layout change yields empty fields instead of an IndexError.
    names = html.xpath("//div[@class='book-info ']/h1/em/text()")
    authors = html.xpath("//div[@class='book-info ']/h1/span/a/text()")
    intro_parts = html.xpath("//div[@class='book-intro']/p/text()")

    book_details = {}
    book_details["book_name"] = names[0] if names else ""
    book_details["book_atuhor"] = authors[0] if authors else ""
    # str.join is linear; the original repeated += loop is quadratic.
    book_details["book_info"] = "".join(part.strip() for part in intro_parts)
    book_details["book_url"] = url
    return book_details
def get_page_urls(url):
    """Crawl one listing page and print the details of every book on it.

    Parameters
    ----------
    url : str
        URL of a qidian.com "all books" listing page.
    """
    page = requests.get(url=url, headers=HEADERS)
    tree = etree.HTML(page.text)
    # print(etree.tostring(tree, encoding="utf-8").decode("utf-8"))
    # Every entry on the listing page links to its detail page through a
    # protocol-relative href, so prepend the scheme before fetching.
    hrefs = tree.xpath("//div[@class='book-mid-info']/h4/a/@href")
    for href in hrefs:
        detail_url = "https:" + href
        # Fetch the detail record for this book and print it.
        print(parse_url(detail_url))
def splider(first_page=1, last_page=9):
    """Walk the paginated listing and crawl each page in turn.

    Parameters
    ----------
    first_page : int
        First listing page to fetch (default 1).
    last_page : int
        Last listing page to fetch, inclusive. Default 9 reproduces the
        original hard-coded range(1, 10).
    """
    url = "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page={}"
    # Fetch each listing page; get_page_urls prints each book's details.
    for page_no in range(first_page, last_page + 1):
        get_page_urls(url.format(page_no))
# Script entry point: start the crawler only when run directly,
# not when imported as a module.
if __name__ == "__main__":
    splider()
# Scrapes basic info from Qidian's fantasy section — just for fun
# (beginner project; anything harder is out of reach for now).
# (Blog footer residue: "Latest recommended article published 2024-05-30 10:32:39")