python爬虫 --单线程爬取笔趣网的小说【基础版】

最新推荐文章于 2021-07-25 11:15:40 发布

D_dalei

最新推荐文章于 2021-07-25 11:15:40 发布

阅读量299

点赞数

分类专栏：爬虫 文章标签：python xpath url

本文链接：https://blog.csdn.net/D_wart/article/details/103695850

版权

爬虫专栏收录该内容

36 篇文章 4 订阅

订阅专栏

import requests
from lxml import etree
import os
def get_proxies(timeout=5):
    """Fetch one proxy address from the local proxy-pool service.

    Sends a GET request to ``http://localhost:5000/get`` and wraps the
    response body in a proxies mapping suitable for ``requests.get``.

    :param timeout: seconds to wait for the proxy-pool service to answer.
    :return: ``{'http': 'http://<proxy>'}`` on success, or ``None`` when the
        request fails or the service answers with an empty body.
    """
    try:
        response = requests.get('http://localhost:5000/get', timeout=timeout)
    except requests.RequestException:
        # Best effort: report failure as None instead of raising.
        return None
    # Strip surrounding whitespace so the proxy URL stays well-formed.
    proxy = response.text.strip()
    if not proxy:
        # An empty body would otherwise produce the unusable proxy 'http://'.
        return None
    return {'http': 'http://' + proxy}
def get_xpath_by_requests(url, proxies, max_retries=5, timeout=10):
    """Download *url* and parse the body into an lxml HTML tree.

    The request is sent with fixed browser-like headers (User-Agent, Cookie,
    Referer). When the download, the UTF-8 decode or the parse fails, a new
    proxy is requested via ``get_proxies()`` and the fetch is retried.

    :param url: URL of the page to fetch.
    :param proxies: proxies dict passed to ``requests.get`` for the first try.
    :param max_retries: maximum number of retries with a fresh proxy after the
        first attempt fails.
    :param timeout: seconds ``requests.get`` may wait before giving up.
    :return: the result of ``etree.HTML`` applied to the decoded page body.
    :raises Exception: the last failure is re-raised once all retries are used,
        instead of recursing without bound.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Cookie': '_abcde_qweasd=0; _abcde_qweasd=0; bdshare_firstime=1577178973028; Hm_lvt_169609146ffe5972484b0957bd1b46d6=1577178973,1577186563,1577186739,1577235413; BAIDU_SSP_lcr=https://www.baidu.com/link?url=AvLJGcMiHKBXi90P2T0xOluezhPz2PeeTLAbP75dmma&wd=&eqid=e131d391001338d8000000025e02b3d2; Hm_lpvt_169609146ffe5972484b0957bd1b46d6=1577235422',
        'Referer': 'http://www.xbiquge.la/'
    }
    retries_left = max_retries
    while True:
        try:
            response = requests.get(
                url, headers=headers, proxies=proxies, timeout=timeout
            )
            return etree.HTML(response.content.decode('utf-8'))
        except Exception:
            if retries_left <= 0:
                # Give up with the real error rather than looping forever.
                raise
            retries_left -= 1
            # Switch to a fresh proxy before the next attempt.
            proxies = get_proxies()
            print('更换{}代理ip！'.format(proxies))
def get_text(text):
    """Return the first item of *text*, or ``''`` when *text* is empty/falsy.

    :param text: an indexable collection, e.g. a list of XPath text results.
    :return: ``text[0]`` if *text* is truthy, otherwise the empty string.
    """
    return text[0] if text else ''
def write_to_txt(text, book_name):
    """Append *text* to the file ``./book/<book_name>``.

    The target directory is created first when it does not exist yet.

    :param text: string to append to the file.
    :param book_name: file name (relative to ``./book/``) to write to.
    """
    filename = './book/' + book_name
    dirname = os.path.dirname(filename)
    # makedirs with exist_ok avoids the check-then-create race and also
    # creates intermediate directories when book_name contains a sub-path.
    os.makedirs(dirname, exist_ok=True)
    with open(filename, 'a', encoding='utf-8') as fp:
        fp.write(text)
def parse_chapter(url):
    """Download one chapter page and append its text to the book's file.

    The chapter title, the book name and the chapter body are extracted with
    XPath and written through ``write_to_txt`` under the book's name.

    Relies on a module-level name ``proxies`` being defined before the call.

    :param url: chapter path that is appended to ``http://www.xbiquge.la``.
    """
    url = 'http://www.xbiquge.la' + url
    html = get_xpath_by_requests(url, proxies)
    chapter_name = get_text(html.xpath('//div[@class="bookname"]/h1/text()'))
    book_name = get_text(html.xpath('//div[@class="con_top"]/a[last()]/text()'))
    content = ''.join(html.xpath('//div[@id="content"]/text()'))
    # Use a real line break: a raw string here would emit the two characters
    # backslash and "n". The trailing newline keeps the next appended chapter
    # title from being glued to the end of this chapter's body.
    text = chapter_name + '\n' + content + '\n'
    write_to_txt(text, book_name)


def parse_novel(url):
    """Fetch a novel's index page and process each chapter link found on it.

    Relies on a module-level name ``proxies`` being defined before the call.

    :param url: URL of the novel's chapter-list page.
    """
    # Parse the index page into an XPath-queryable tree.
    index_page = get_xpath_by_requests(url, proxies)
    for chapter_href in index_page.xpath('//div[@id="list"]/dl/dd/a/@href'):
        parse_chapter(chapter_href)
def main():
    """Crawl every novel linked from the fixed category listing page.

    Relies on a module-level name ``proxies`` being defined before the call.
    """
    category_url = 'http://www.xbiquge.la/xuanhuanxiaoshuo/'
    listing_page = get_xpath_by_requests(category_url, proxies)
    for novel_url in listing_page.xpath('//span[@class="s2"]/a/@href'):
        parse_novel(novel_url)




if __name__ == '__main__':
    # `proxies` is deliberately a module-level name: main(), parse_novel()
    # and parse_chapter() read it as a global.
    proxies = get_proxies()
    # Report which proxy mapping is in use before starting the crawl.
    print('使用{}代理ip中！'.format(proxies))
    main()

D_dalei

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬虫 --单线程爬取笔趣网的小说【基础版】

import requests from lxml import etree import os def get_proxies(): try: response = requests.get('http://localhost:5000/get') proxy = response.text proxies = { ...
复制链接

扫一扫

专栏目录