A record of scraping a novel with a Python crawler

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from concurrent import futures
import time
import random
from queue import Queue
def chapterUrl_list():
    # Parse the locally saved chapter index page and queue up every chapter link.
    soup = BeautifulSoup(open('ss.html', encoding='utf-8'), features='html.parser')
    chapterurllist = Queue()
    chapterurls = soup.find('div', attrs={"class": "listmain"}).find_all('a')
    for url in chapterurls:
        href = url.get('href')
        if href is None:
            continue
        # Keep only hrefs that contain '9832'; every other link on the index page is skipped.
        if href.find('9832') < 0:
            continue
        chapterurllist.put(href)
    return chapterurllist

def write2txt(result):
    # result is a Future from session.get; block here until the response is ready.
    r = result.result()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "lxml")
    title = soup.find('h1').get_text()
    contents = soup.find_all(name='div', attrs={'class': 'showtxt'})
    for txt in contents:
        print('title-----------------------------' + title)
        a = '\n' + title + '\n' + txt.text
        # Append the chapter title and body to the output file.
        with open(r'打更人.txt', 'a', encoding='utf-8') as f:
            f.write(a)

# A small pool of User-Agent strings; one is picked at random for the whole session.
header = [
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
]
headers = {'User-Agent': random.choice(header)}
session = requests.Session()
session.headers.update(headers)

# 1. Collect every chapter URL into a queue.
url_ls = chapterUrl_list()
print('--------------------------------------------------------------------------------------')
print(url_ls.qsize())
print('--------------------------------------------------------------------------------------')

text_ls = Queue()
# Up to three threads download chapter pages; a separate single-thread executor
# runs write2txt, so only one task appends to the file at a time.
executor = futures.ThreadPoolExecutor(max_workers=3)
ex = futures.ThreadPoolExecutor(max_workers=1)

# 2. Walk the URL queue: fetch each chapter and hand the response Future to the writer.
while not url_ls.empty():
    ft = executor.submit(session.get, url_ls.get())
    time.sleep(1)  # crude politeness delay between requests
    text_ls.put(ft)
    if not text_ls.empty():
        ex.submit(write2txt, text_ls.get())

ps: This was my first Python script, heh, pretty much cobbled together.
I borrowed from a lot of other people's code, and somehow it just started working.
It uses multiple threads, yet there seems to be no multithreading benefit at all:
scraping 1000 chapters took close to 15 minutes…
My head's spinning; I'll patch it up and improve it when I get the chance.
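
In hindsight, the missing speedup is easy to explain: the main loop sleeps a full second before submitting the next request, so downloads start at most once per second and 1000 chapters cannot take much less than 1000 seconds, no matter how many worker threads exist. Below is a minimal sketch of a more concurrent variant, reusing the chapterUrl_list, write2txt and session defined above (fetch_pool and pending are just illustrative names): all downloads are submitted first so up to three run in parallel, and the results are then written back in submission order so the chapters stay in sequence. It drops the one-second delay, which is where the speedup comes from, but it also hits the site harder.

# Sketch only: assumes chapterUrl_list, write2txt and session from the script above.
from concurrent import futures

url_ls = chapterUrl_list()
fetch_pool = futures.ThreadPoolExecutor(max_workers=3)  # up to three downloads in flight

# 1. Queue every chapter download without waiting on any of them.
pending = []
while not url_ls.empty():
    pending.append(fetch_pool.submit(session.get, url_ls.get()))

# 2. Consume the Futures in submission order; write2txt blocks on each one,
#    so the chapters land in the file in the right sequence.
for ft in pending:
    write2txt(ft)

fetch_pool.shutdown(wait=True)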
