# 爬取起点玄幻的简单信息~~纯属娱乐(新手上路,太难的也爬不到)

import requests
from lxml import etree
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
def parse_url(url):
    """Fetch one Qidian book-detail page and extract basic metadata.

    :param url: absolute URL of the book's detail page.
    :return: dict with keys ``book_name``, ``book_atuhor``, ``book_info``,
        ``book_url``.  NOTE(review): the "book_atuhor" key is a pre-existing
        typo kept for backward compatibility with existing consumers.
    :raises requests.HTTPError: if the server returns an error status.
    :raises IndexError: if the page layout changes and the title/author
        xpath matches nothing (same failure mode as the original code).
    """
    # timeout prevents the crawler hanging forever on a dead connection;
    # raise_for_status surfaces 4xx/5xx instead of parsing an error page.
    response = requests.get(url=url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    html = etree.HTML(response.text)
    book_details = {
        "book_name": html.xpath("//div[@class='book-info ']/h1/em/text()")[0],
        "book_atuhor": html.xpath("//div[@class='book-info ']/h1/span/a/text()")[0],
    }
    # Join the intro paragraphs in one pass instead of quadratic `+=`.
    paragraphs = html.xpath("//div[@class='book-intro']/p/text()")
    book_details["book_info"] = "".join(p.strip() for p in paragraphs)
    book_details["book_url"] = url
    return book_details

def get_page_urls(url):
    """Print the detail dict of every book listed on one result page.

    :param url: URL of a Qidian "all books" listing page.
    """
    page = requests.get(url=url, headers=HEADERS)
    tree = etree.HTML(page.text)
    # One protocol-relative href per book card on the listing page.
    hrefs = tree.xpath("//div[@class='book-mid-info']/h4/a/@href")
    for href in hrefs:
        # Fetch and display the full details for this book.
        print(parse_url("https:" + href))
def splider():
    """Walk listing pages 1 through 9 of Qidian's "all books" index."""
    template = "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page={}"
    for page_no in range(1, 10):
        # Crawl every book linked from this listing page.
        get_page_urls(template.format(page_no))


if __name__ == "__main__":
    splider()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值