Background:
I previously posted an article on collecting material; this one is about collecting the material's sources. The code is fairly simple, and to avoid having to re-crawl everything when an error occurs, the crawl is split into two steps: first fetch the list of books, then fetch their contents. (The crawled data is for learning purposes only; do not use it commercially.)
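Because data.txt acts as the checkpoint between the two steps, the content step could also be made restartable. Below is a minimal sketch of that idea, assuming each finished book ends up as doc/<title>.txt exactly as in the content script further down; the helper name already_downloaded is hypothetical and is not part of the scripts themselves.

import os, re

def already_downloaded(entry, doc_dir='doc'):
    # entry is one line from data.txt, e.g. '【book title】 http://www.biquge.info/...'
    # (hypothetical helper: the scripts below do not skip finished books themselves)
    m = re.search(r'【(.*)】', entry)
    return bool(m) and os.path.exists(os.path.join(doc_dir, m.group(1) + '.txt'))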
Code (fetching the list):
import requests, time
from bs4 import BeautifulSoup

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:  # treat only HTTP 200 as success
        response.encoding = response.apparent_encoding
        return response.text
    return None

def getList(soup):
    pattern = soup.find('div', {'id': 'newscontent'})
    pattern = pattern.find('ul')
    spans = pattern.find_all('span', {'class': 's2'})  # each s2 span holds one book link
    result = []
    for span in spans:
        tagA = span.find('a')
        result.append('【' + tagA.get_text() + '】 ' + tagA['href'])
    write(result)

def write(result):
    with open('data.txt', 'a', encoding='utf-8') as f:  # append so results from earlier pages are kept
        for word in result:
            print(word)
            f.write('\n' + word)

listUrl = 'http://www.biquge.info/list/6_1.html'

if __name__ == '__main__':
    html = get_one_page(listUrl)
    soup = BeautifulSoup(html, 'lxml')
    # read the pagination block to find the number of the last list page
    pagelink = soup.find('div', {'id': 'pagelink'})
    last = pagelink.find('a', {'class': 'last'}).get_text()
    for pageIndex in range(int(last)):
        pageUrl = 'http://www.biquge.info/list/6_' + str(pageIndex + 1) + '.html'
        html = get_one_page(pageUrl)
        soup = BeautifulSoup(html, 'lxml')
        print('Current page: ' + str(pageIndex + 1))
        getList(soup)
        time.sleep(3)  # pause between pages to go easy on the site
Console output:
data.txt output:
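Each line that write() appends to data.txt has the form 【book title】 followed by a space and the book's URL. A quick sketch of how the content script later pulls the two parts back out (the sample entry is made up for illustration):

import re

sample = '【Some Book】 http://www.biquge.info/xx_xxxx/'  # made-up entry in the format write() produces
title = re.search(r'【.*】', sample).group()    # -> '【Some Book】'
link = re.search(r'http(.*)', sample).group()   # -> 'http://www.biquge.info/xx_xxxx/'
print(title, link)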
Once the list has been crawled, we can move on to crawling the actual chapter contents:
Code (chapter contents):
import requests, time, re
from bs4 import BeautifulSoup
import bs4

def get_one_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.99 Safari/537.36'
    }
    requests.packages.urllib3.disable_warnings()  # suppress the InsecureRequestWarning triggered by verify=False
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:  # treat only HTTP 200 as success
        response.encoding = response.apparent_encoding
        return response.text
    return None

def parse_one_page(html):
    soup = BeautifulSoup(html, 'lxml')
    chapter_name = soup.h1.string
    pattern = soup.find('div', {'id': 'content'})
    # the navigation bar contains 上一章 / 章节目录 / 下一章; capture the href of the "next chapter" link
    next_url = re.search(r'<a.*?上一章.*?(章节目录|章节列表).*?a href="(.*?)">下一章.*?报错欠更</a>', html, re.S).group(2)
    book_list = []  # chapter title plus every text node of the content div
    if chapter_name:
        book_list.append(chapter_name)
    if pattern:
        for br in pattern:
            if isinstance(br, bs4.element.NavigableString):  # keep plain text nodes, skip the <br> tags
                if br.string and br.string != 'go' and br.string != 'over':  # drop the site's filler markers
                    book_list.append(br.string)
    print(chapter_name, 'downloaded, next chapter:', next_url)
    return book_list, next_url

def write(result):
    with open('data.txt', 'a', encoding='utf-8') as f:
        for word in result:
            print(word)
            f.write('\n' + word)

fileNameList = []

def read():
    # collect the titles inside 【...】 from data.txt (helper, not used by the main flow below)
    with open('data.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip('\n')  # drop the trailing newline of each entry
            match = re.search(r'【.*】', line)
            if match:
                title = re.sub(r'(【|】)', '', match.group())
                fileNameList.append(title)

def getContent(bookUrl, url, fileName):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    print('Download started, please wait...')
    while True:
        # retry a failed request up to 10 times before giving up
        num = 0
        while True:
            html = get_one_page(url)
            num = num + 1
            if num > 10 or html is not None:
                break
        if html is None:
            print('Failed to fetch', url, 'after 10 attempts, stopping this book')
            break
        content, next_url = parse_one_page(html)
        with open('doc/' + fileName + '.txt', 'a+', encoding='utf-8') as f:
            for i in content:
                f.write(i)
        url = next_url
        if url == bookUrl:  # the last chapter's "next" link points back to the book's index page
            break
    print('Download finished')
    end = time.perf_counter()
    print('Running time: %.2f seconds' % (end - start))
    time.sleep(2)

def getFileList():
    fileList = []
    with open('data.txt', 'r', encoding='utf-8') as f:
        for line in f.readlines():
            line = line.strip('\n')  # drop the trailing newline of each entry
            if line:
                fileList.append(line)
    return fileList

if __name__ == '__main__':
    fileList = getFileList()
    for file in fileList:
        link = re.search(r'http(.*)', file)
        title = re.search(r'【.*】', file)
        if link and title:
            title = title.group()
            link = link.group()
            html = get_one_page(link)
            soup = BeautifulSoup(html, 'lxml')
            # the first <a> inside div#list is the first chapter of the book
            pattern = soup.find('div', {'id': 'list'})
            pattern = pattern.find('a')
            a = pattern['href']
            newHref = link + a
            print(title)
            getContent(link, newHref, title)
Result (console):
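One practical note: getContent() writes each book into doc/<title>.txt but never creates the folder, so the doc directory has to exist before the script runs. Something like the following (or a manual mkdir doc) takes care of it:

import os

os.makedirs('doc', exist_ok=True)  # create the output folder if it does not exist yet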