python 爬虫爬取小说 单进程与多进程 学习

转载地址

1 单进程:

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys

"""
类说明:下载《笔趣看》网小说《一念永恒》
"""


class downloader(object):
    """Download the novel 《一念永恒》 from the Biqukan site (single process).

    Usage: call get_download_url() to collect chapter links, then writer()
    to fetch every chapter and append it to '一念永恒.txt'.
    """

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []  # chapter titles
        self.urls = []  # chapter URLs
        self.nums = 0  # number of chapters

    def get_download_url(self):
        """Fetch the index page and populate self.names / self.urls / self.nums."""
        html = requests.get(self.target).text
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_='listmain')
        anchor_soup = BeautifulSoup(str(div[0]), 'html.parser')
        # Renamed from `list`: don't shadow the builtin.
        links = anchor_soup.find_all('a')
        # The first 15 anchors are the "latest chapters" duplicates; skip them.
        chapters = links[15:]
        self.nums = len(chapters)
        for each in chapters:
            self.names.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_content(self, target):
        """Fetch one chapter page at *target* and return its cleaned text."""
        html = requests.get(url=target).text
        soup = BeautifulSoup(html, 'html.parser')
        div = soup.find_all('div', class_='showtxt')
        # The site pads paragraphs with 8 non-breaking spaces; turn them into blank lines.
        return div[0].text.replace('\xa0' * 8, '\n\n')

    def writer(self):
        """Append every collected chapter, in order, to the output file.

        Bug fix: the file is now opened with a context manager so it is
        closed even if a download raises mid-way.
        """
        with open('一念永恒.txt', 'a', encoding='utf-8') as f:
            for i in range(self.nums):
                f.write(self.names[i])
                f.write('\n')
                f.writelines(self.get_content(self.urls[i]))
                f.write('\n\n')
                print(i)  # progress indicator
if __name__ == "__main__":
    # Single-process run: collect the chapter list, then download and
    # write every chapter sequentially.
    novel = downloader()
    novel.get_download_url()
    novel.writer()

2 多进程

注意:dl.names[i] 是 bs4 的 NavigableString,它持有对整个解析树的引用,无法被 pickle 序列化,因此不能直接作为 Pool 的 args 跨进程传递;用 str() 把它转成普通字符串即可。

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests, sys
from multiprocessing import Pool
import string
# HTTP request headers that mimic a desktop Chrome browser.
# NOTE(review): the Cookie value looks like it was captured from one real
# session and may expire — confirm it is still required by the site.
headers={
'Cookie':r'UM_distinctid=164fd71debe478-0aed9f594fffa9-3c604504-1fa400-164fd71debf4c2; bcolor=; font=; size=; fontcolor=; width=; CNZZDATA1260938422=2084872231-1533260238-%7C1533551125',
'Host':'www.biqukan.com',
#'Referer':r'http://www.biqukan.com/1_1094/',
'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

class downloader(object):
    """Collects chapter titles and URLs of 《一念永恒》 from the Biqukan index page."""

    def __init__(self):
        self.server = 'http://www.biqukan.com/'
        self.target = 'http://www.biqukan.com/1_1094/'
        self.names = []  # chapter titles
        self.urls = []  # chapter URLs
        self.nums = 0  # chapter count

    def get_download_url(self):
        """Parse the index page and fill self.names / self.urls / self.nums."""
        page = requests.get(url=self.target, headers=headers).text
        listmain_divs = BeautifulSoup(page, "html.parser").find_all('div', class_='listmain')
        anchor_soup = BeautifulSoup(str(listmain_divs[0]), "html.parser")
        # Drop the first 15 anchors (the "latest chapters" duplicates).
        chapter_links = anchor_soup.find_all('a')[15:]
        self.nums = len(chapter_links)
        self.names.extend(link.string for link in chapter_links)
        self.urls.extend(self.server + link.get('href') for link in chapter_links)


def get_contents(target):
    """Download the chapter page at *target* and return its cleaned body text."""
    page = requests.get(url=target).text
    soup = BeautifulSoup(page, "html.parser")
    body = soup.find_all('div', class_='showtxt')[0]
    # Runs of 8 non-breaking spaces mark paragraph breaks on this site.
    return body.text.replace('\xa0' * 8, '\n\n')

def writer(name, path, texts):
    """Fetch one chapter and append it, titled *name*, to the file at *path*.

    NOTE(review): despite its name, *texts* carries the chapter URL (it is
    passed straight to get_contents) — confirm before renaming the parameter,
    since it is part of the public signature.
    """
    chapter_body = get_contents(texts)
    with open(path, 'a', encoding='utf-8') as out:
        out.write(name + '\n')
        out.writelines(chapter_body)
        out.write('\n\n')


if __name__ == "__main__":
    # Multi-process run: a pool of 15 workers each fetches one chapter and
    # appends it to the output file.
    # NOTE(review): 15 processes appending to the same file concurrently can
    # write chapters out of order or interleave output — confirm acceptable.
    pool = Pool(15)
    dl = downloader()
    dl.get_download_url()

    # Bug fix: the novel's title is 《一念永恒》, not 《一年永恒》
    # (matches the filename and the class docstrings elsewhere in the file).
    print('《一念永恒》开始下载:')

    for i in range(dl.nums):
        # str() converts the bs4 NavigableString into a plain string so it
        # can be pickled and passed across the process boundary.
        pool.apply_async(writer, args=(str(dl.names[i]), '一念永恒.txt', dl.urls[i]))

    pool.close()
    pool.join()
    print('《一念永恒》下载完成')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值