Hello everyone, I'm 咿哑呀. Today I'll walk you through a web crawler. It crawls the text content of web pages, so you can get through pages much faster than reading them one by one in a browser. The full code is below; take your time studying it:
from urllib import request, parse
from urllib.parse import quote
import string
import chardet
from bs4 import BeautifulSoup
import re
import time

class spider:
    def __init__(self, my_root_url, title_tag, con_tag, OutputFile):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled
        self.datas = []        # collected page data
        self.add_new_url(my_root_url)
        count = 1
        while self.has_new_url():
            try:
                new_url = self.get_new_url()
                print('%d. crawling %s' % (count, new_url))
                html_context = self.download(new_url)
                new_urls, new_data = self.get_new_urls_data(new_url, html_context, title_tag, con_tag)
                self.add_new_urls(new_urls)
                self.output_html(new_data, OutputFile)
                count += 1
                time.sleep(1)
            except Exception:
                print("failed to crawl this page")

    def download(self, new_url):
        # Fetch a page, detect its encoding with chardet, and return the decoded HTML.
        if new_url is None:
            return None
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063")
        opener = request.build_opener()
        opener.addheaders = [headers]
        request.install_opener(opener)
        url_ = quote(new_url, safe=string.printable)
        response = request.urlopen(url_)
        if response.getcode() != 200:
            return None
        html = response.read()
        charset = chardet.detect(html)['encoding']
        return html.decode(charset, 'ignore')

    def output_html(self, new_data, OutputFile):
        # Write each collected page to a .txt file named after its title.
        if new_data is None:
            return
        self.datas.append(new_data)
        for data in self.datas:
            if data['title'] and data['con'] and data['url']:
                fout = open(OutputFile + '\\' + data['title'].replace(' ', '').replace('\n', '') + '.txt',
                            'w', encoding='utf8')
                fout.write('%s' % data['title'].replace(' ', '').replace('\n', ''))
                fout.write('%s' % data['con'].replace(' ', ''))
                fout.write('(source: %s)' % data['url'])
                fout.close()

    def get_new_urls_data(self, page_url, html_context, title_tag, con_tag):
        # Parse the page: collect further links to crawl and extract the title and body text.
        if page_url is None or html_context is None:
            return
        new_urls = set()
        red_data = {}
        soup = BeautifulSoup(html_context, "html.parser")
        # follow links whose href contains .htm or .asp
        pat = re.compile('.htm|.asp')
        links = soup.find_all(href=pat)
        for link in links:
            if page_url not in link["href"]:
                new_url = link["href"]
                new_full_url = parse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)
        red_data['url'] = page_url
        if soup.find(class_=title_tag[0]):
            title_node = soup.find(class_=title_tag[0])
            if title_node.get_text():
                red_data['title'] = title_node.get_text()
            else:
                red_data['title'] = ""
        else:
            red_data['title'] = ""
        con_node = soup.find(class_=con_tag[0])
        if con_node:
            red_data['con'] = con_node.get_text()
        else:
            red_data['con'] = ""
        return new_urls, red_data

    def add_new_url(self, my_root_url):
        if my_root_url is None:
            return
        if my_root_url not in self.new_urls and my_root_url not in self.old_urls:
            self.new_urls.add(my_root_url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

if __name__ == "__main__":
    root_url = "http://www.******.cn"
    # CSS class of the title element on the target pages
    title_tags = ['page_bt']
    # CSS class of the body-text element on the target pages
    con_tags = ['page_wznr_2']
    output_dir = 'out'
    spider(root_url, title_tags, con_tags, output_dir)
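
A quick note on the two parameters above: 'page_bt' and 'page_wznr_2' are the CSS class names used for the title and body text on the target site's pages; for a different site you would look up the corresponding classes in the browser's developer tools and substitute them. Here is a small illustrative sketch (the sample HTML is made up, not taken from any real page) of the kind of markup those selectors are assumed to match and how BeautifulSoup pulls the text out:

    # Illustrative sketch only, not part of the crawler: sample HTML is made up,
    # and the class names page_bt / page_wznr_2 are specific to the original site.
    from bs4 import BeautifulSoup

    sample_html = '''
    <html><body>
      <h1 class="page_bt">Sample Title</h1>
      <div class="page_wznr_2">Sample body text.</div>
    </body></html>
    '''
    soup = BeautifulSoup(sample_html, "html.parser")
    print(soup.find(class_="page_bt").get_text())      # -> Sample Title
    print(soup.find(class_="page_wznr_2").get_text())  # -> Sample body text.

Also note that output_html opens files under the output directory with open(..., 'w'), so the out directory must already exist before you run the script (create it by hand or with os.makedirs(output_dir, exist_ok=True)), and the third-party packages beautifulsoup4 and chardet need to be installed.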
After running this script, all the crawled content ends up in the out directory. If this article was useful to you, please like, share, bookmark, and comment; if you enjoy my articles, search for and follow "咿哑呀", and I'll share more articles with you.