# Baidu Tieba "good threads" image crawler  (original header: 代码 = "code")
import json
import os
import re
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup as BS
class urlmanager(object):
    """Crawl frontier: tracks URLs waiting to be fetched vs. already fetched."""

    def __init__(self):
        # Unvisited URLs, and URLs that have already been handed out.
        self.new_urls = set()
        self.old_urls = set()

    def add_new_urls(self, urls):
        """Queue every URL in *urls*, skipping already-fetched ones."""
        for candidate in urls:
            self.add_new_url(candidate)

    def add_new_url(self, url):
        """Queue *url* unless it has already been fetched."""
        if url not in self.old_urls:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return True while at least one unvisited URL remains."""
        return len(self.new_urls) > 0

    def get_new_url(self):
        """Pop an arbitrary unvisited URL and mark it as fetched."""
        pending = self.new_urls.pop()
        self.old_urls.add(pending)
        return pending
class htmldownloader(object):
    """Fetches a page over HTTP and returns its body decoded as UTF-8."""

    def download(self, url, timeout=10):
        """Return the text body of *url*, or None for a missing URL or a
        non-200 response.

        timeout: seconds before the request is aborted (new parameter with
        a default, so existing callers are unaffected).  The original had
        no timeout at all, so one dead host could hang the whole crawl.
        """
        if url is None:
            return None
        # Spoofed desktop UA so the site serves the normal (non-mobile) page.
        User_Agent = 'Mozilla/4.0(compatible; Chrome 80.0.3987.116; Windows 10)'
        headers = {'User-Agent': User_Agent}
        r = requests.get(url, headers=headers, timeout=timeout)
        if r.status_code == 200:
            # The target site is UTF-8; requests may otherwise guess ISO-8859-1.
            r.encoding = 'utf-8'
            return r.text
        return None
class htmlparser(object):
    """Extracts thread links and image URLs from parsed tieba pages."""

    def pre_parse(self, soup, url):
        """Store the parsed document and its base URL for later extraction."""
        self.soup = soup
        self.url = url

    def get_link(self):
        """Return absolute URLs of thread anchors (<a class="j_th_tit">)
        whose title attribute contains at least one digit."""
        anchors = self.soup.find_all('a', class_='j_th_tit',
                                     title=re.compile(r'.*\d+.*'))
        # hrefs are site-relative; resolve against the listing-page URL.
        return [urllib.parse.urljoin(self.url, a['href']) for a in anchors]

    def get_pic(self):
        """Return {'title': sanitized thread title, 'pics': [image URLs]}."""
        # Strip characters that are awkward in a directory name.
        pattern = re.compile(r'\.|\s|\-|"|"|\(|\)')
        title = pattern.sub('', self.soup.find('h3').string)
        # BUG FIX: the original alternation was
        #     http://(?:tiebapic)  |  (?:imgsa)\.baidu\.com/.*\.jpg
        # i.e. it matched anything containing "http://tiebapic" with no
        # .jpg requirement, and matched imgsa URLs on any scheme.  Group
        # the two hosts so the whole pattern applies to both.
        links = self.soup.find_all(
            'img', class_='BDE_Image',
            src=re.compile(r'http\://(?:tiebapic|imgsa)\.baidu\.com/.*\.jpg'))
        pics = [img['src'] for img in links]
        return {'title': title, 'pics': pics}
class datastore(object):
    """Downloads image URLs into per-title sub-directories on disk."""

    def __init__(self, path='./manhua/'):
        """Remember and create (if needed) the root download directory.

        makedirs(..., exist_ok=True) also creates missing parents and
        avoids the check-then-create race of the original exists()/mkdir().
        """
        self.path = path
        os.makedirs(path, exist_ok=True)

    def store(self, dic):
        """Download every URL in dic['pics'] into <root>/<title>/<i>.jpg."""
        folder = os.path.join(self.path, dic['title'])
        os.makedirs(folder, exist_ok=True)
        for i, pic in enumerate(dic['pics']):
            target = os.path.join(folder, str(i) + '.jpg')
            urllib.request.urlretrieve(pic, target, self.schedule)

    def schedule(self, blocknum, blocksize, totalsize):
        """urlretrieve reporthook: print a one-line progress bar.

        totalsize is -1 when the server sends no Content-Length; fall back
        to an arbitrary 400 KiB estimate so the bar still advances.  Also
        guards totalsize == 0, which divided by zero in the original.
        """
        if totalsize <= 0:
            totalsize = 50 * 8192
        per = min(100.0, 100.0 * blocknum * blocksize / totalsize)
        print('已下载:{}*{},文件大小:{};当前下载进度:{}>{:.3f}%'.format(
            blocknum, blocksize, totalsize, '-' * int(per / 10), per), end='\r')
class highcontrol(object):
    # Orchestrator: wires the URL manager, downloader, parser and store
    # together and drives the two-phase crawl.
    def __init__(self):
        self.manager = urlmanager()
        self.download = htmldownloader()
        self.parse = htmlparser()
        self.store = datastore()

    def crawl(self):
        # Phase 1: seed listing pages of the "good threads" tab and collect
        # per-thread links from each.
        # NOTE(review): range(3, 5) never yields 0, so the i == 0 branch
        # below is dead code — presumably left over from an earlier range
        # that started at 0; confirm the intended page range.
        # NOTE(review): the original file's indentation was lost; the
        # fetch/parse steps are reconstructed as part of this loop body,
        # which is the only reading that keeps phase 2 consistent.
        for i in range(3,5):
            if i==0:
                self.manager.add_new_url('https://tieba.baidu.com/f?kw=&ie=utf-8&tab=good&cid=1')
            else:
                # pn paginates the listing 50 threads per page.
                self.manager.add_new_url('https://tieba.baidu.com/f?kw=&ie=utf-8&tab=good&cid=1&pn='+str(i*50))
            url = self.manager.get_new_url()
            text = self.download.download(url)
            # The thread list is embedded (commented out) inside a <code>
            # element; dropping the closing tags lets BS parse past them.
            text = text.replace('</html>','')
            text = text.replace('</body>','')
            soup = BS(text)
            code = soup.find_all('code',class_='pagelet_html',id='pagelet_html_frs-list/pagelet/thread_list')
            # Re-parse the embedded payload as its own document.
            soup = BS(code[0].string)
            self.parse.pre_parse(soup,url)
            urls = self.parse.get_link()
            self.manager.add_new_urls(urls)
        # Phase 2: visit every queued thread page and download its images.
        while self.manager.has_new_url():
            url = self.manager.get_new_url()
            text = self.download.download(url)
            soup = BS(text)
            self.parse.pre_parse(soup,url)
            dic = self.parse.get_pic()
            self.store.store(dic)
# Guard the entry point so importing this module does not start a crawl.
if __name__ == '__main__':
    hc = highcontrol()
    hc.crawl()