Hello everyone, I'm 咿哑呀. Today I'll walk you through a web crawler. It crawls the text content of web pages, so you can get through pages much faster than reading them one by one in a browser. The full code is below; take your time studying it:
from urllib import request, parse
from urllib.parse import quote
import string
import chardet
from bs4 import BeautifulSoup
import re
import time

class spider:
    def __init__(self, my_root_url, title_tag, con_tag, OutputFile):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled
        self.datas = []        # collected page data
        self.add_new_url(my_root_url)
        count = 1
        while self.has_new_url():
            try:
                new_url = self.get_new_url()
                print('%d. crawling %s' % (count, new_url))
                html_context = self.download(new_url)
                new_urls, new_data = self.get_new_urls_data(new_url, html_context, title_tag, con_tag)
                self.add_new_urls(new_urls)
                self.output_html(new_data, OutputFile)
                count += 1
                time.sleep(1)
            except Exception:
                print("failed to crawl this page")

    def download(self, new_url):
        # Fetch a page, detect its encoding with chardet, and return the decoded HTML.
        if new_url is None:
            return None
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063")
        opener = request.build_opener()
        opener.addheaders = [headers]
        request.install_opener(opener)
        url_ = quote(new_url, safe=string.printable)
        response = request.urlopen(url_)
        if response.getcode() != 200:
            return None
        html = response.read()
        charset = chardet.detect(html)['encoding']
        return html.decode(charset, 'ignore')

    def output_html(self, new_data, OutputFile):
        # Write each collected page to a .txt file named after its title.
        if new_data is None:
            return
        self.datas.append(new_data)
        for data in self.datas:
            if data['title'] and data['con'] and data['url']:
                fout = open(OutputFile + '\\' + data['title'].replace(' ', '').replace('\n', '') + '.txt',
                            'w', encoding='utf8')
                fout.write('%s' % data['title'].replace(' ', '').replace('\n', ''))
                fout.write('%s' % data['con'].replace(' ', ''))
                fout.write('(source: %s)' % data['url'])
                fout.close()

    def get_new_urls_data(self, page_url, html_context, title_tag, con_tag):
        # Parse the page: collect further links to crawl and extract the title and body text.
        if page_url is None or html_context is None:
            return
        new_urls = set()
        red_data = {}
        soup = BeautifulSoup(html_context, "html.parser")
        # follow links whose href contains .htm or .asp
        pat = re.compile('.htm|.asp')
        links = soup.find_all(href=pat)
        for link in links:
            if page_url not in link["href"]:
                new_url = link["href"]
                new_full_url = parse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)
        red_data['url'] = page_url
        if soup.find(class_=title_tag[0]):
            title_node = soup.find(class_=title_tag[0])
            if title_node.get_text():
                red_data['title'] = title_node.get_text()
            else:
                red_data['title'] = ""
        else:
            red_data['title'] = ""
        con_node = soup.find(class_=con_tag[0])
        if con_node:
            red_data['con'] = con_node.get_text()
        else:
            red_data['con'] = ""
        return new_urls, red_data

    def add_new_url(self, my_root_url):
        if my_root_url is None:
            return
        if my_root_url not in self.new_urls and my_root_url not in self.old_urls:
            self.new_urls.add(my_root_url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

if __name__ == "__main__":
    root_url = "http://www.******.cn"
    # CSS class of the title element on the target pages
    title_tags = ['page_bt']
    # CSS class of the body-text element on the target pages
    con_tags = ['page_wznr_2']
    output_dir = 'out'
    spider(root_url, title_tags, con_tags, output_dir)
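
A quick note on the two parameters above: 'page_bt' and 'page_wznr_2' are the CSS class names used for the title and body text on the target site's pages; for a different site you would look up the corresponding classes in the browser's developer tools and substitute them. Here is a small illustrative sketch (the sample HTML is made up, not taken from any real page) of the kind of markup those selectors are assumed to match and how BeautifulSoup pulls the text out:

    # Illustrative sketch only, not part of the crawler: sample HTML is made up,
    # and the class names page_bt / page_wznr_2 are specific to the original site.
    from bs4 import BeautifulSoup

    sample_html = '''
    <html><body>
      <h1 class="page_bt">Sample Title</h1>
      <div class="page_wznr_2">Sample body text.</div>
    </body></html>
    '''
    soup = BeautifulSoup(sample_html, "html.parser")
    print(soup.find(class_="page_bt").get_text())      # -> Sample Title
    print(soup.find(class_="page_wznr_2").get_text())  # -> Sample body text.

Also note that output_html opens files under the output directory with open(..., 'w'), so the out directory must already exist before you run the script (create it by hand or with os.makedirs(output_dir, exist_ok=True)), and the third-party packages beautifulsoup4 and chardet need to be installed.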
After running this script, all the crawled content ends up in the out directory. If this article was useful to you, please like, share, bookmark, and comment; if you enjoy my articles, search for and follow "咿哑呀", and I'll share more articles with you.