Multithreaded crawler (caching) & interactive mode & passing multiple arguments to pool.map & removing blank lines

Optimized code

import requests
from lxml import etree
from multiprocessing.dummy import Pool

#### Fetch and parse a page ####
def get_dow(url):
        ip = {
                }
        headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
                }
        req = requests.get(url,headers=headers,proxies=ip)
        req.encoding = 'gbk'
        soup = etree.HTML(req.text)
        return (soup)

#### Download chapters with multiple threads ####
def cache_download(url):
	dictname = url[-13:-5]
	chapter_url = prefix+url
	soup = get_dow(chapter_url)
	content_name = soup.xpath('/html/body/div[4]/text()')[0]
	content_text = soup.xpath('//*[@id="content"]/text()')
	content_text = ''.join(content_text)
	dict[dictname] = [content_name,content_text]
	print(content_name)

#### Merge the threaded downloads in chapter order ####
def cache_text(url):
	dictname = url[-13:-5]
	content_name = dict[dictname][0]
	content_text = dict[dictname][1]
	with open(name+'.txt','a',encoding='utf-8') as f:
		f.write(content_name+'\n')
		f.write(content_text+'\n')
		
#### Entry point ####
if __name__ == '__main__':
	print('Only supports: http://www.62ma.com'+'\n')
	dict = {}  # in-memory cache: {chapter key: [title, text]}
	a = input('Enter the link code to download: ')
	b = len(str(a))
	if b > 5:
		c=str(a)[0:3]
	else:
		c=str(a)[0:2]
	target='http://www.62ma.com/s/'+c+'_'+a+'/'
	prefix = 'http://www.62ma.com'
	soup = get_dow(target)
	chapter_url_list = soup.xpath('/html/body/div[1]/ul//span/a/@href')
	name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
	print('\n'+'The novel to download is: '+name+'\n')
	chapter_unm = int(input('Enter the starting chapter: ')) - 1
	thread = int(input('\n'+'How many download threads to start: '))
	pool = Pool(thread)
	pool.imap(cache_download,chapter_url_list[chapter_unm:])  # threads fill the in-memory cache
	pool.close()
	pool.join()
	pool = Pool(1)
	pool.imap(cache_text,chapter_url_list[chapter_unm:])  # single worker writes the file in chapter order
	pool.close()
	pool.join()
	print('\n'+'....Download complete....')
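Why the two passes above work: the threads may finish in any order, so each chapter is parked in a shared dict keyed by a stable slice of its URL, and a second single-worker pass then writes the chapters out in list order. A minimal sketch of that pattern, using placeholder URLs and fake content instead of the real site:

from multiprocessing.dummy import Pool

cache = {}
urls = ['/1001.html', '/1002.html', '/1003.html']  # placeholder chapter paths

def download(url):
	# in the real script this is requests.get + XPath parsing
	cache[url] = 'text of ' + url

def write(url):
	# runs in a single-worker pool, so chapters land in the file in list order
	with open('demo.txt', 'a', encoding='utf-8') as f:
		f.write(cache[url] + '\n')

pool = Pool(4)
pool.map(download, urls)   # completion order does not matter
pool.close()
pool.join()
pool = Pool(1)
pool.map(write, urls)      # writing order does
pool.close()
pool.join()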

Multithreaded crawler (in-memory cache) & interactive mode

import requests
import os
from lxml import etree
from multiprocessing.dummy import Pool

'''
Things that need to be changed when switching to another site:

Character set: change GBK to UTF-8

__main__
1. 	print('Only supports: http://www.62ma.com'+'\n')
2. 	target='http://www.62ma.com/s/'+c+'_'+a+'/'
3.	list_tag = soup.xpath('/html/body/div[1]/ul/span')
4.	name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]

cache_chapter & get_chapter
1.	chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]

cache_download & get_download
1.	content_name = tree.xpath('/html/body/div[4]/text()')[0]
2.	content_text = tree.xpath('//*[@id="content"]/text()')

'''

name = None  # novel title, set in __main__
dict = {}    # in-memory cache: {chapter key: [title, text]}

#### Cached download -- build the chapter list ####
def cache_chapter(list_tag,chapter_unm,pool_num):
	pool = Pool(pool_num)
	chapter_url_list = []
	for dd_tag in list_tag:
		chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]
		chapter_url_list.append(chapter_url)
	if pool_num == 88:  # 88 is used as a flag for the threaded download pass
		pool.imap(cache_download,chapter_url_list[chapter_unm:])
		print('\n'+'Downloading with the cache...')
		pool.close()
		pool.join()
	elif pool_num == 1:  # single worker merges the cached chapters in order
		pool.imap(cache_text,chapter_url_list[chapter_unm:])
		pool.close()
		pool.join()

#### Cached download -- download chapters with multiple threads ####
def cache_download(url):
	dictname = url[-13:-5]
	chapter_req = requests.get(url)
	chapter_req.encoding = 'gbk'
	tree = etree.HTML(chapter_req.text)
	content_name = tree.xpath('/html/body/div[4]/text()')[0]
	content_text = tree.xpath('//*[@id="content"]/text()')
	content_text = ''.join(content_text)
	dict[dictname] = [content_name,content_text]
	print(content_name)


#### Cached download -- merge the threaded downloads in chapter order ####
def cache_text(url):
	dictname = url[-13:-5]
	content_name = dict[dictname][0]
	content_text = dict[dictname][1]
	with open('./bak','a',encoding='utf-8') as f:
		f.write(content_name+'\n')
		f.write(content_text+'\n')

#### Plain download -- build the chapter list ####
def get_chapter(list_tag,chapter_unm,pool_num):
	pool = Pool(pool_num)
	chapter_url_list = []
	for dd_tag in list_tag:
		chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]
		chapter_url_list.append(chapter_url)
	pool.imap(get_download,chapter_url_list[chapter_unm:])
	pool.close()
	pool.join()

#### Plain download -- download chapters sequentially ####
def get_download(url):
	chapter_req = requests.get(url)
	chapter_req.encoding = 'gbk'
	tree = etree.HTML(chapter_req.text)
	content_name = tree.xpath('/html/body/div[4]/text()')[0]
	content_text = tree.xpath('//*[@id="content"]/text()')
	content_text = ''.join(content_text)
	print(content_name)
	with open('./bak','a',encoding='utf-8') as f:
		f.write(content_name+'\n')
		f.write(content_text+'\n')

#### Strip blank lines ####
def clearBlankLine():
	file1 = open('./bak', 'r', encoding='utf-8')
	file2 = open(name+'.txt', 'w', encoding='utf-8')
	try:
		for line in file1.readlines():
			if line == '\n':
				line = line.strip('\n')
			file2.write(line)
	finally:
		file1.close()
		file2.close()
	os.remove('./bak')

#### Entry point ####
if __name__ == '__main__':
	print('Only supports: http://www.62ma.com'+'\n')
	a = input('Enter the link code to download: ')
	b = len(str(a))
	if b > 5:
		c=str(a)[0:3]
	else:
		c=str(a)[0:2]
	target='http://www.62ma.com/s/'+c+'_'+a+'/'
	req=requests.get(url=target)
	req.encoding = 'gbk'
	soup = etree.HTML(req.text)
	list_tag = soup.xpath('/html/body/div[1]/ul/span')
	name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
	print('\n'+'The novel to download is: '+name+'\n')
	chapter_unm = int(input('Enter the starting chapter: ')) - 1
	print('\n'+'Use the caching mode?')
	cacheif = int(input('Yes: 1   No: 2   : '))
	if cacheif == 1:
		pool_num = 88
		cache_chapter(list_tag,chapter_unm,pool_num)
		pool_num = 1
		cache_chapter(list_tag,chapter_unm,pool_num)
	else:
		pool_num = 1
		get_chapter(list_tag,chapter_unm,pool_num)
	clearBlankLine()
	print('\n'+'....Download complete....')

Multithreaded crawler (file cache) & interactive mode

import os,shutil
import requests
from lxml import etree
from multiprocessing.dummy import Pool

name = None

#### Cached download -- build the chapter list ####
def cache_chapter(list_tag,chapter_unm,pool_num):
	pool = Pool(pool_num)
	chapter_url_list = []
	for dd_tag in list_tag:
		chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]
		chapter_url_list.append(chapter_url)
	if pool_num == 88:
		pool.imap(cache_download,chapter_url_list[chapter_unm:])
		print('\n'+'Downloading with the cache...')
		pool.close()
		pool.join()
	elif pool_num == 1:
		pool.imap(cache_text,chapter_url_list[chapter_unm:])
		pool.close()
		pool.join()

#### Cached download -- download chapters with multiple threads ####
def cache_download(url):
	dictname = url[-13:-5]
	chapter_req = requests.get(url)
	chapter_req.encoding = 'gbk'
	tree = etree.HTML(chapter_req.text)
	content_name = tree.xpath('/html/body/div[4]/text()')[0]
	content_text = tree.xpath('//*[@id="content"]/text()')
	content_text = ''.join(content_text)
	print(content_name)
	with open('./cache/'+dictname,'w',encoding='utf-8') as f:
		f.write(content_name+'\n')
		f.write(content_text+'\n')

#### Cached download -- merge the threaded downloads in chapter order ####
def cache_text(url):
	dictname = url[-13:-5]
	file1 = open('./cache/'+dictname, 'r', encoding='utf-8')
	file2 = open(name+'.txt', 'a', encoding='utf-8')
	try:
		for line in file1.readlines():
			if line == '\n':
				line = line.strip('\n')
			file2.write(line)
	finally:
		file1.close()
		file2.close()

#### Plain download -- build the chapter list ####
def chapter(list_tag,chapter_unm,pool_num):
	pool = Pool(pool_num)
	chapter_url_list = []
	for dd_tag in list_tag:
		chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]
		chapter_url_list.append(chapter_url)
	pool.imap(get_download,chapter_url_list[chapter_unm:])
	pool.close()
	pool.join()

#### Plain download -- download chapters sequentially ####
def get_download(url):
	chapter_req = requests.get(url)
	chapter_req.encoding = 'gbk'
	tree = etree.HTML(chapter_req.text)
	content_name = tree.xpath('/html/body/div[4]/text()')[0]
	content_text = tree.xpath('//*[@id="content"]/text()')
	content_text = ''.join(content_text)
	print(content_name)
	with open('./bak','a',encoding='utf-8') as f:
		f.write(content_name+'\n')
		f.write(content_text+'\n')

#### Strip blank lines ####
def clearBlankLine():
	file1 = open('bak', 'r', encoding='utf-8')
	file2 = open(name+'.txt', 'w', encoding='utf-8')
	try:
		for line in file1.readlines():
			if line == '\n':
				line = line.strip('\n')
			file2.write(line)
	finally:
		file1.close()
		file2.close()
	os.remove('bak')

#### Entry point ####
if __name__ == '__main__':
	print('Only supports: http://www.62ma.com'+'\n')
	a = input('Enter the link code to download: ')
	b = len(str(a))
	if b > 5:
		c=str(a)[0:3]
	else:
		c=str(a)[0:2]
	target='http://www.62ma.com/s/'+c+'_'+a+'/'
	req=requests.get(url=target)
	req.encoding = 'gbk'
	soup = etree.HTML(req.text)
	list_tag = soup.xpath('/html/body/div[1]/ul/span')
	name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
	print('\n'+'The novel to download is: '+name+'\n')
	chapter_unm = int(input('Enter the starting chapter: ')) - 1
	print('\n'+'Use the caching mode?')
	cacheif = int(input('Yes: 1   No: 2   : '))
	if cacheif == 1:
		os.mkdir('./cache')
		pool_num = 88
		cache_chapter(list_tag,chapter_unm,pool_num)
		pool_num = 1
		cache_chapter(list_tag,chapter_unm,pool_num)
		shutil.rmtree('./cache')
	else:
		pool_num = 1
		chapter(list_tag,chapter_unm,pool_num)
		clearBlankLine()
	print('\n'+'....Download complete....')
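The file-cache variant above follows the same two-pass idea, but parks each chapter in its own file under ./cache/ instead of in a dict, so memory stays flat for long novels. A minimal sketch of the pattern, with placeholder keys and content:

import os, shutil

os.mkdir('./cache')
keys = ['0001', '0002', '0003']  # placeholder chapter keys

# pass 1: each download (threaded in the real script) writes its own cache file
for key in keys:
	with open('./cache/' + key, 'w', encoding='utf-8') as f:
		f.write('chapter ' + key + '\n')

# pass 2: merge the cache files in chapter order, then drop the cache directory
with open('demo.txt', 'w', encoding='utf-8') as out:
	for key in keys:
		with open('./cache/' + key, 'r', encoding='utf-8') as f:
			out.write(f.read())
shutil.rmtree('./cache')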

Cached download of multiple novels (passing multiple arguments to pool.map)
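Pool.map and Pool.imap only hand a single argument to the worker function; the script below works around that with functools.partial, which pre-binds the extra argument (the novel name) so the pool only has to supply the varying one (the chapter URL). A minimal sketch of the trick, with placeholder names:

from functools import partial
from multiprocessing.dummy import Pool

def worker(name, url):
	print(name, url)

pool = Pool(2)
# partial(worker, 'demo') behaves like worker with name already fixed to 'demo',
# so map only needs to pass the second argument
pool.map(partial(worker, 'demo'), ['/a.html', '/b.html'])
pool.close()
pool.join()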

import requests
import os
from lxml import etree
from multiprocessing.dummy import Pool
from functools import partial

dict = {}  # shared in-memory cache across all novels: {chapter key: [title, text]}

#### Cached download -- build the chapter list ####
def cache_chapter(list_tag,pool_num,name):
	pool = Pool(pool_num)
	chapter_url_list = []
	for dd_tag in list_tag:
		chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]
		chapter_url_list.append(chapter_url)
	if pool_num == 88:
		pool.imap(partial(cache_download,name),chapter_url_list)
		pool.close()
		pool.join()
	elif pool_num == 1:
		pool.imap(partial(cache_text,name),chapter_url_list)
		pool.close()
		pool.join()

#### Cached download -- download chapters with multiple threads ####
def cache_download(name,url):
	dictname = url[-13:-5]
	chapter_req = requests.get(url)
	chapter_req.encoding = 'gbk'
	tree = etree.HTML(chapter_req.text)
	content_name = tree.xpath('/html/body/div[4]/text()')[0]
	content_text = tree.xpath('//*[@id="content"]/text()')
	content_text = ''.join(content_text)
	dict[dictname] = [content_name,content_text]

#### Cached download -- merge the threaded downloads in chapter order ####
def cache_text(name,url):
	dictname = url[-13:-5]
	content_name = dict[dictname][0]
	content_text = dict[dictname][1]
	with open(name,'a',encoding='utf-8') as f:
		f.write(content_name+'\n')
		f.write(content_text+'\n')

#### Strip blank lines ####
def clearBlankLine(name):
	file1 = open(name, 'r', encoding='utf-8')
	file2 = open(name+'.txt', 'w', encoding='utf-8')
	try:
		for line in file1.readlines():
			if line == '\n':
				line = line.strip('\n')
			file2.write(line)
	finally:
		file1.close()
		file2.close()
	os.remove(name)
	print('《'+name+'》'+' download complete...')
	
#### Entry point ####
def run(url):
        chapter_url=url.xpath('./a/@href')[0]
        req=requests.get(url=chapter_url)
        req.encoding = 'gbk'
        soup = etree.HTML(req.text)
        list_tag = soup.xpath('/html/body/div[1]/ul/span')
        name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
        pool_num = 88
        cache_chapter(list_tag,pool_num,name)
        pool_num = 1
        cache_chapter(list_tag,pool_num,name)
        clearBlankLine(name)
        
if __name__ == '__main__':
        pool = Pool(8)
        chapter_url_list = []
        target='http://www.62ma.com/top/'
        req=requests.get(url=target)
        req.encoding = 'gbk'
        soup = etree.HTML(req.text)
        list_tag = soup.xpath('//*[@class="nav clearfix"]/span')
        for ss in list_tag:
                chapter_url_list.append(ss)
        pool.imap(run,chapter_url_list)
        pool.close()
        pool.join()

Removing blank lines

import os

def clearBlankLine():
	# copy 'bak' to name+'.txt' while dropping empty lines ('name' is the novel title set in __main__)
	file1 = open('bak', 'r', encoding='utf-8')
	file2 = open(name+'.txt', 'w', encoding='utf-8')
	try:
		for line in file1.readlines():
			if line == '\n':
				line = line.strip('\n')  # a bare newline becomes '', so nothing is written
			file2.write(line)
	finally:
		file1.close()
		file2.close()
	os.remove('bak')
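The function works because a line that is only '\n' is stripped down to an empty string, and writing an empty string adds nothing to the output file. The same cleanup can also be written as a simple filter; a minimal self-contained sketch (file names here are only examples):

with open('bak', 'r', encoding='utf-8') as src, open('out.txt', 'w', encoding='utf-8') as dst:
	for line in src:
		if line != '\n':  # skip blank lines, copy everything else
			dst.write(line)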