import requests
from lxml import etree
from multiprocessing.dummy import Pool
# Site root; the chapter hrefs scraped from the index page are appended to it.
prefix = 'http://www.62ma.com'
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30
# Title of the novel being downloaded; assigned in the __main__ section.
name = None
# In-memory chapter store filled by the download threads:
# chapter href -> [chapter title, chapter text].
chapter_cache = {}


#### Fetch a page ####
def get_dow(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The response body is decoded as GBK before parsing. Exceptions raised
    by ``requests`` (including timeouts) propagate to the caller.
    """
    # Placeholder for a requests ``proxies`` mapping; left empty here.
    ip = {}
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'}
    # Without a timeout a stalled connection would block its worker thread
    # (and therefore the whole pool) indefinitely.
    req = requests.get(url, headers=headers, proxies=ip,
                       timeout=REQUEST_TIMEOUT)
    req.encoding = 'gbk'
    return etree.HTML(req.text)


#### Download one chapter (runs in the worker threads) ####
def cache_download(url):
    """Fetch the chapter at ``prefix + url`` and store it in the cache.

    *url* is the chapter href as scraped from the index page. On success
    the chapter is stored in ``chapter_cache`` under *url* and its title is
    printed. A chapter that cannot be fetched, or whose page lacks the
    title node, is reported and skipped so that one bad page does not
    abort the whole download.
    """
    try:
        soup = get_dow(prefix + url)
        content_name = soup.xpath('/html/body/div[4]/text()')[0]
    except (requests.RequestException, IndexError) as err:
        print('下载失败:' + url + ' (' + str(err) + ')')
        return
    content_text = ''.join(soup.xpath('//*[@id="content"]/text()'))
    chapter_cache[url] = [content_name, content_text]
    print(content_name)


#### Append one cached chapter to the output file ####
def cache_text(url):
    """Append the cached chapter for *url* to ``<name>.txt``.

    Called once per chapter, in index order, after all downloads have
    finished. Chapters missing from the cache (their download failed) are
    reported and skipped.
    """
    chapter = chapter_cache.get(url)
    if chapter is None:
        print('缺少章节,已跳过:' + url)
        return
    content_name, content_text = chapter
    with open(name + '.txt', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')


#### Entry point ####
if __name__ == '__main__':
    print('仅支持:http://www.62ma.com' + '\n')
    a = input('输入要下载的链接码:')
    # The index URL is /s/<leading characters>_<code>/ where the leading
    # part is 3 characters for codes longer than 5 and 2 otherwise.
    c = a[0:3] if len(a) > 5 else a[0:2]
    target = 'http://www.62ma.com/s/' + c + '_' + a + '/'
    soup = get_dow(target)
    chapter_url_list = soup.xpath('/html/body/div[1]/ul//span/a/@href')
    # The last two characters of the title attribute are dropped
    # (presumably a fixed suffix on the site -- confirm if the site changes).
    name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
    print('\n' + '你须要下载的小说是:' + name + '\n')
    # Chapters are entered 1-based; clamp so 0 or a negative number does
    # not turn into a slice counted from the end of the list.
    chapter_unm = max(int(input('请输入开始章节:')) - 1, 0)
    # A pool needs at least one worker.
    thread = max(int(input('\n' + '要启动多少线程下载:')), 1)
    wanted = chapter_url_list[chapter_unm:]
    # map() blocks until every download has finished and re-raises worker
    # exceptions; the former fire-and-forget imap() discarded them.
    with Pool(thread) as pool:
        pool.map(cache_download, wanted)
    # Merging must be sequential to keep chapter order, so no pool is used.
    for chapter_url in wanted:
        cache_text(chapter_url)
    print('\n' + '....下载完成....')
多线程爬虫(内存缓存)&交互式
import requests
import os
from lxml import etree
from multiprocessing.dummy import Pool
'''
更换网站须要更改的几个地方:
字符集 GBK 更改 UTF-8
__main__
1. print('仅支持:http://www.62ma.com'+'\n')
2. target='http://www.62ma.com/s/'+c+'_'+a+'/'
3. list_tag = soup.xpath('/html/body/div[1]/ul/span')
4. name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
cache_chapter & get_chapter
1. chapter_url='http://www.62ma.com'+dd_tag.xpath('./a/@href')[0]
cache_download & get_download
1. content_name = tree.xpath('/html/body/div[4]/text()')[0]
2. content_text = tree.xpath('//*[@id="content"]/text()')
'''
# Title of the novel being downloaded; assigned in the __main__ section.
name = None
# In-memory chapter store filled by the download threads:
# chapter URL -> [chapter title, chapter text].
chapter_cache = {}
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def _chapter_urls(list_tag):
    """Return the absolute chapter URL for every index element in *list_tag*."""
    return ['http://www.62ma.com' + dd_tag.xpath('./a/@href')[0]
            for dd_tag in list_tag]


def _fetch_chapter(url):
    """Download one chapter page; return ``(title, text)`` or ``None``.

    The page is decoded as GBK. A request failure or a page without the
    title node is reported on stdout and yields ``None`` so the caller can
    skip the chapter instead of aborting the whole run.
    """
    try:
        # Without a timeout a stalled connection would block its worker
        # thread indefinitely.
        chapter_req = requests.get(url, timeout=REQUEST_TIMEOUT)
        chapter_req.encoding = 'gbk'
        tree = etree.HTML(chapter_req.text)
        content_name = tree.xpath('/html/body/div[4]/text()')[0]
    except (requests.RequestException, IndexError) as err:
        print('下载失败:' + url + ' (' + str(err) + ')')
        return None
    content_text = ''.join(tree.xpath('//*[@id="content"]/text()'))
    return content_name, content_text


#### Cached download -- chapter list ####
def cache_chapter(list_tag, chapter_unm, pool_num):
    """Run one phase of the cached download over the chapter list.

    *list_tag* holds the index-page elements, *chapter_unm* is the 0-based
    index of the first chapter to process. *pool_num* selects the phase:
    88 downloads the chapters into ``chapter_cache`` with 88 threads, 1
    appends the cached chapters to ``./bak`` in order. Any other value does
    nothing.
    """
    wanted = _chapter_urls(list_tag)[chapter_unm:]
    if pool_num == 88:
        print('\n' + '正在应用缓存下载中。。。')
        # map() blocks until all downloads are done and re-raises worker
        # exceptions; the former unconsumed imap() discarded them.
        with Pool(pool_num) as pool:
            pool.map(cache_download, wanted)
    elif pool_num == 1:
        # Sequential by definition, so a thread pool adds nothing here.
        for chapter_url in wanted:
            cache_text(chapter_url)


#### Cached download -- fetch one chapter (worker thread) ####
def cache_download(url):
    """Fetch the chapter at *url*, store it in ``chapter_cache``, print its title."""
    chapter = _fetch_chapter(url)
    if chapter is None:
        return
    content_name, content_text = chapter
    chapter_cache[url] = [content_name, content_text]
    print(content_name)


#### Cached download -- append one cached chapter in order ####
def cache_text(url):
    """Append the cached chapter for *url* to ``./bak``.

    Chapters missing from the cache (their download failed) are reported
    and skipped.
    """
    chapter = chapter_cache.get(url)
    if chapter is None:
        print('缺少章节,已跳过:' + url)
        return
    content_name, content_text = chapter
    with open('./bak', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')


#### Plain download -- chapter list ####
def get_chapter(list_tag, chapter_unm, pool_num):
    """Download chapters from index *chapter_unm* onward with *pool_num* threads.

    Each chapter is appended to ``./bak`` by ``get_download``; chapter order
    in the file is only kept when *pool_num* is 1.
    """
    wanted = _chapter_urls(list_tag)[chapter_unm:]
    with Pool(pool_num) as pool:
        pool.map(get_download, wanted)


#### Plain download -- fetch and write one chapter ####
def get_download(url):
    """Fetch the chapter at *url*, print its title and append it to ``./bak``."""
    chapter = _fetch_chapter(url)
    if chapter is None:
        return
    content_name, content_text = chapter
    print(content_name)
    with open('./bak', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')


#### Strip blank lines ####
def clearBlankLine():
    """Copy ``./bak`` to ``<name>.txt`` without blank lines, then delete ``./bak``.

    Only lines consisting of a single newline are dropped. ``./bak`` is
    removed only after the copy succeeded, so a failed copy does not
    destroy the downloaded text.
    """
    with open('./bak', 'r', encoding='utf-8') as src, \
            open(name + '.txt', 'w', encoding='utf-8') as dst:
        dst.writelines(line for line in src if line != '\n')
    os.remove('./bak')


#### Entry point ####
if __name__ == '__main__':
    print('仅支持:http://www.62ma.com' + '\n')
    a = input('输入要下载的链接码:')
    # The index URL is /s/<leading characters>_<code>/ where the leading
    # part is 3 characters for codes longer than 5 and 2 otherwise.
    c = a[0:3] if len(a) > 5 else a[0:2]
    target = 'http://www.62ma.com/s/' + c + '_' + a + '/'
    req = requests.get(url=target, timeout=REQUEST_TIMEOUT)
    req.encoding = 'gbk'
    soup = etree.HTML(req.text)
    list_tag = soup.xpath('/html/body/div[1]/ul/span')
    # The last two characters of the title attribute are dropped
    # (presumably a fixed suffix on the site -- confirm if the site changes).
    name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
    print('\n' + '你须要下载的小说是:' + name + '\n')
    # Chapters are entered 1-based; clamp so 0 or a negative number does
    # not turn into a slice counted from the end of the list.
    chapter_unm = max(int(input('请输入开始章节:')) - 1, 0)
    print('\n' + '是否须要缓存技术?')
    cacheif = int(input('是:1 否:2 :'))
    if cacheif == 1:
        # Phase 1: parallel download into memory; phase 2: ordered merge.
        cache_chapter(list_tag, chapter_unm, 88)
        cache_chapter(list_tag, chapter_unm, 1)
    else:
        get_chapter(list_tag, chapter_unm, 1)
    # Both modes leave their raw output in ./bak.
    clearBlankLine()
    print('\n' + '....下载完成....')
多线程爬虫(文件缓存)&交互式
import os,shutil
import requests
from lxml import etree
from multiprocessing.dummy import Pool
# Title of the novel being downloaded; assigned in the __main__ section.
name = None
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30


def _chapter_urls(list_tag):
    """Return the absolute chapter URL for every index element in *list_tag*."""
    return ['http://www.62ma.com' + dd_tag.xpath('./a/@href')[0]
            for dd_tag in list_tag]


def _cache_path(url):
    """Return the cache file path used for the chapter at *url*.

    The file name is the 8 characters preceding the URL's last 5
    characters. A '/' inside that slice is replaced so the name can never
    point into a sub-directory of ``./cache``.
    """
    return './cache/' + url[-13:-5].replace('/', '_')


def _fetch_chapter(url):
    """Download one chapter page; return ``(title, text)`` or ``None``.

    The page is decoded as GBK. A request failure or a page without the
    title node is reported on stdout and yields ``None`` so the caller can
    skip the chapter instead of aborting the whole run.
    """
    try:
        # Without a timeout a stalled connection would block its worker
        # thread indefinitely.
        chapter_req = requests.get(url, timeout=REQUEST_TIMEOUT)
        chapter_req.encoding = 'gbk'
        tree = etree.HTML(chapter_req.text)
        content_name = tree.xpath('/html/body/div[4]/text()')[0]
    except (requests.RequestException, IndexError) as err:
        print('下载失败:' + url + ' (' + str(err) + ')')
        return None
    content_text = ''.join(tree.xpath('//*[@id="content"]/text()'))
    return content_name, content_text


#### Cached download -- chapter list ####
def cache_chapter(list_tag, chapter_unm, pool_num):
    """Run one phase of the file-cached download over the chapter list.

    *list_tag* holds the index-page elements, *chapter_unm* is the 0-based
    index of the first chapter to process. *pool_num* selects the phase:
    88 downloads each chapter into its own file under ``./cache`` with 88
    threads, 1 appends those files to ``<name>.txt`` in order. Any other
    value does nothing.
    """
    wanted = _chapter_urls(list_tag)[chapter_unm:]
    if pool_num == 88:
        print('\n' + '正在应用缓存下载中。。。')
        # map() blocks until all downloads are done and re-raises worker
        # exceptions; the former unconsumed imap() discarded them.
        with Pool(pool_num) as pool:
            pool.map(cache_download, wanted)
    elif pool_num == 1:
        # Sequential by definition, so a thread pool adds nothing here.
        for chapter_url in wanted:
            cache_text(chapter_url)


#### Cached download -- fetch one chapter (worker thread) ####
def cache_download(url):
    """Fetch the chapter at *url*, print its title and write it to its cache file."""
    chapter = _fetch_chapter(url)
    if chapter is None:
        return
    content_name, content_text = chapter
    print(content_name)
    with open(_cache_path(url), 'w', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')


#### Cached download -- append one cached chapter in order ####
def cache_text(url):
    """Append the cache file for *url* to ``<name>.txt``, dropping blank lines.

    A chapter without a cache file (its download failed) is reported and
    skipped.
    """
    try:
        src = open(_cache_path(url), 'r', encoding='utf-8')
    except FileNotFoundError:
        print('缺少章节,已跳过:' + url)
        return
    with src, open(name + '.txt', 'a', encoding='utf-8') as dst:
        dst.writelines(line for line in src if line != '\n')


#### Plain download -- chapter list ####
def chapter(list_tag, chapter_unm, pool_num):
    """Download chapters from index *chapter_unm* onward with *pool_num* threads.

    Each chapter is appended to ``./bak`` by ``get_download``; chapter order
    in the file is only kept when *pool_num* is 1.
    """
    wanted = _chapter_urls(list_tag)[chapter_unm:]
    with Pool(pool_num) as pool:
        pool.map(get_download, wanted)


#### Plain download -- fetch and write one chapter ####
def get_download(url):
    """Fetch the chapter at *url*, print its title and append it to ``./bak``."""
    chapter_data = _fetch_chapter(url)
    if chapter_data is None:
        return
    content_name, content_text = chapter_data
    print(content_name)
    with open('./bak', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')


#### Strip blank lines ####
def clearBlankLine():
    """Copy ``bak`` to ``<name>.txt`` without blank lines, then delete ``bak``.

    Only lines consisting of a single newline are dropped. ``bak`` is
    removed only after the copy succeeded, so a failed copy does not
    destroy the downloaded text.
    """
    with open('bak', 'r', encoding='utf-8') as src, \
            open(name + '.txt', 'w', encoding='utf-8') as dst:
        dst.writelines(line for line in src if line != '\n')
    os.remove('bak')


#### Entry point ####
if __name__ == '__main__':
    print('仅支持:http://www.62ma.com' + '\n')
    a = input('输入要下载的链接码:')
    # The index URL is /s/<leading characters>_<code>/ where the leading
    # part is 3 characters for codes longer than 5 and 2 otherwise.
    c = a[0:3] if len(a) > 5 else a[0:2]
    target = 'http://www.62ma.com/s/' + c + '_' + a + '/'
    req = requests.get(url=target, timeout=REQUEST_TIMEOUT)
    req.encoding = 'gbk'
    soup = etree.HTML(req.text)
    list_tag = soup.xpath('/html/body/div[1]/ul/span')
    # The last two characters of the title attribute are dropped
    # (presumably a fixed suffix on the site -- confirm if the site changes).
    name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
    print('\n' + '你须要下载的小说是:' + name + '\n')
    # Chapters are entered 1-based; clamp so 0 or a negative number does
    # not turn into a slice counted from the end of the list.
    chapter_unm = max(int(input('请输入开始章节:')) - 1, 0)
    print('\n' + '是否须要缓存技术?')
    cacheif = int(input('是:1 否:2 :'))
    if cacheif == 1:
        # exist_ok: a directory left over from an interrupted run must not
        # prevent a new one.
        os.makedirs('./cache', exist_ok=True)
        try:
            # Phase 1: parallel download to ./cache; phase 2: ordered merge.
            cache_chapter(list_tag, chapter_unm, 88)
            cache_chapter(list_tag, chapter_unm, 1)
        finally:
            # Always drop the cache so stale chapter files cannot leak
            # into a later run.
            shutil.rmtree('./cache')
    else:
        chapter(list_tag, chapter_unm, 1)
        # Only the plain mode goes through the intermediate bak file; the
        # cached mode writes <name>.txt directly.
        clearBlankLine()
    print('\n' + '....下载完成....')
多篇幅缓存下载(pool.map传递多参数)
import requests
import os
from lxml import etree
from multiprocessing.dummy import Pool
from functools import partial
# In-memory chapter store shared by all download threads:
# chapter URL -> [chapter title, chapter text]. Keyed by the full URL so
# chapters of different novels downloaded concurrently cannot collide.
chapter_cache = {}
# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 30


#### Cached download -- chapter list ####
def cache_chapter(list_tag, pool_num, name):
    """Run one phase of the cached download for the novel *name*.

    *list_tag* holds the index-page elements of the novel. *pool_num*
    selects the phase: 88 downloads all chapters into ``chapter_cache``
    with 88 threads, 1 appends the cached chapters to the file *name* in
    order. Any other value does nothing.
    """
    chapter_url_list = ['http://www.62ma.com' + dd_tag.xpath('./a/@href')[0]
                        for dd_tag in list_tag]
    if pool_num == 88:
        # map() blocks until all downloads are done and re-raises worker
        # exceptions; the former unconsumed imap() discarded them.
        with Pool(pool_num) as pool:
            pool.map(partial(cache_download, name), chapter_url_list)
    elif pool_num == 1:
        # Sequential by definition, so a thread pool adds nothing here.
        for chapter_url in chapter_url_list:
            cache_text(name, chapter_url)


#### Cached download -- fetch one chapter (worker thread) ####
def cache_download(name, url):
    """Fetch the chapter at *url* and store it in ``chapter_cache``.

    *name* is accepted so the function can be bound with ``partial`` like
    ``cache_text``; it is not used. A request failure or a page without the
    title node is reported and the chapter is skipped.
    """
    try:
        # Without a timeout a stalled connection would block its worker
        # thread indefinitely.
        chapter_req = requests.get(url, timeout=REQUEST_TIMEOUT)
        chapter_req.encoding = 'gbk'
        tree = etree.HTML(chapter_req.text)
        content_name = tree.xpath('/html/body/div[4]/text()')[0]
    except (requests.RequestException, IndexError) as err:
        print('下载失败:' + url + ' (' + str(err) + ')')
        return
    content_text = ''.join(tree.xpath('//*[@id="content"]/text()'))
    chapter_cache[url] = [content_name, content_text]


#### Cached download -- append one cached chapter in order ####
def cache_text(name, url):
    """Append the cached chapter for *url* to the file *name*.

    The entry is removed from ``chapter_cache`` as it is written so the
    shared cache does not keep every chapter of every novel in memory.
    Chapters missing from the cache (their download failed) are reported
    and skipped.
    """
    chapter = chapter_cache.pop(url, None)
    if chapter is None:
        print('缺少章节,已跳过:' + url)
        return
    content_name, content_text = chapter
    with open(name, 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')


#### Strip blank lines ####
def clearBlankLine(name):
    """Copy the file *name* to ``<name>.txt`` without blank lines.

    Only lines consisting of a single newline are dropped. The source file
    is removed after a successful copy and a completion message is printed.
    """
    with open(name, 'r', encoding='utf-8') as src, \
            open(name + '.txt', 'w', encoding='utf-8') as dst:
        dst.writelines(line for line in src if line != '\n')
    os.remove(name)
    print('《' + name + '》' + ' 下载完成。。。')


#### Download one novel ####
def run(url):
    """Download the novel linked from the ranking-page element *url*.

    *url* is an lxml element whose child ``<a>`` href is requested as-is as
    the novel's index page. A novel whose index page cannot be fetched or
    parsed is reported and skipped so the other novels still download.
    """
    try:
        chapter_url = url.xpath('./a/@href')[0]
        req = requests.get(url=chapter_url, timeout=REQUEST_TIMEOUT)
        req.encoding = 'gbk'
        soup = etree.HTML(req.text)
        list_tag = soup.xpath('/html/body/div[1]/ul/span')
        # The last two characters of the title attribute are dropped
        # (presumably a fixed suffix on the site -- confirm if it changes).
        name = str(soup.xpath('/html/body/div[1]/p/a/@title')[0])[0:-2]
    except (requests.RequestException, IndexError) as err:
        print('下载失败:' + str(err))
        return
    # Phase 1: parallel download into memory; phase 2: ordered merge.
    cache_chapter(list_tag, 88, name)
    cache_chapter(list_tag, 1, name)
    clearBlankLine(name)


if __name__ == '__main__':
    target = 'http://www.62ma.com/top/'
    req = requests.get(url=target, timeout=REQUEST_TIMEOUT)
    req.encoding = 'gbk'
    soup = etree.HTML(req.text)
    list_tag = soup.xpath('//*[@class="nav clearfix"]/span')
    # Eight novels are processed at a time; map() waits for all of them and
    # re-raises any unexpected worker exception.
    with Pool(8) as pool:
        pool.map(run, list_tag)
删除换行符
def clearBlankLine():
    """Copy ``bak`` to ``<name>.txt`` without blank lines, then delete ``bak``.

    ``name`` is a module-level variable holding the output file's base
    name. Only lines consisting of a single newline are dropped. Both files
    are closed even if the copy fails, and ``bak`` is removed only after
    the copy succeeded so a failure does not destroy the source text.
    """
    with open('bak', 'r', encoding='utf-8') as src, \
            open(name + '.txt', 'w', encoding='utf-8') as dst:
        dst.writelines(line for line in src if line != '\n')
    os.remove('bak')