Scraping web page text with Python: grab a site's text content so you can browse it quickly

Hi everyone, I'm 咿哑呀. Today I'll walk you through a web crawler that scrapes the text content of web pages, so you can skim a site's articles much faster. The full code is below; take your time studying it:

from urllib import request, parse
from urllib.parse import quote
import string
import os
import chardet
from bs4 import BeautifulSoup
import re
import time


class spider:

    def __init__(self, my_root_url, title_tag, con_tag, OutputFile):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled
        self.datas = []         # collected data
        self.add_new_url(my_root_url)
        count = 1
        while self.has_new_url():
            try:
                new_url = self.get_new_url()
                print('%d. Crawling %s' % (count, new_url))
                html_context = self.download(new_url)
                new_urls, new_data = self.get_new_urls_data(new_url, html_context, title_tag, con_tag)
                self.add_new_urls(new_urls)
                self.output_html(new_data, OutputFile)
                count += 1
                time.sleep(1)   # be polite: pause one second between requests
            except Exception:
                print("Crawl failed")

    def download(self, new_url):
        if new_url is None:
            return None
        # Pretend to be a desktop browser so the site does not block the request
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063")
        opener = request.build_opener()
        opener.addheaders = [headers]
        request.install_opener(opener)
        url_ = quote(new_url, safe=string.printable)   # escape non-ASCII characters in the URL
        response = request.urlopen(url_)
        if response.getcode() != 200:
            return None
        html = response.read()
        charset = chardet.detect(html)['encoding']     # detect the page encoding
        return html.decode(charset, 'ignore')


    def output_html(self, new_data, OutputFile):
        if new_data is None:
            return
        self.datas.append(new_data)
        if new_data['title'] and new_data['con'] and new_data['url']:
            # Use the cleaned-up title as the file name
            title = new_data['title'].replace(' ', '').replace('\n', '')
            fout = open(os.path.join(OutputFile, title + '.txt'), 'w', encoding='utf8')
            fout.write('%s' % title)
            fout.write('%s' % new_data['con'].replace(' ', ''))
            fout.write('(Source: %s)' % new_data['url'])
            fout.close()

    def get_new_urls_data(self, page_url, html_context, title_tag, con_tag):
        if page_url is None or html_context is None:
            return None, None
        new_urls = set()
        red_data = {}
        soup = BeautifulSoup(html_context, "html.parser")
        # Collect links to other .htm / .asp pages and turn them into absolute URLs
        pat = re.compile(r'\.htm|\.asp')
        links = soup.find_all(href=pat)
        for link in links:
            if page_url not in link["href"]:
                new_url = link["href"]
                new_full_url = parse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)
        red_data['url'] = page_url
        # Extract the title and body text by their CSS class names
        title_node = soup.find(class_=title_tag[0])
        if title_node and title_node.get_text():
            red_data['title'] = title_node.get_text()
        else:
            red_data['title'] = ""
        con_node = soup.find(class_=con_tag[0])
        if con_node:
            red_data['con'] = con_node.get_text()
        else:
            red_data['con'] = ""
        return new_urls, red_data


    # ----- URL manager: track which URLs still need crawling -----
    def add_new_url(self, my_root_url):
        if my_root_url is None:
            return
        if my_root_url not in self.new_urls and my_root_url not in self.old_urls:
            self.new_urls.add(my_root_url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url


if __name__ == "__main__":
    root_url = "http://www.******.cn"
    # CSS class of the title element on the target site
    title_tags = ['page_bt']
    # CSS class of the article body element on the target site
    con_tags = ['page_wznr_2']
    output_dir = 'out'
    os.makedirs(output_dir, exist_ok=True)   # make sure the output directory exists
    spider(root_url, title_tags, con_tags, output_dir)
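A note on the two style parameters: 'page_bt' and 'page_wznr_2' are the CSS class names of the title and body elements on this particular site, and they will differ from site to site (look them up in your browser's developer tools). Below is a minimal sketch, using the same urllib/BeautifulSoup/chardet stack as above, for checking on a single page that you picked the right class names before launching the full crawl; the URL is only a placeholder:

from urllib import request
from bs4 import BeautifulSoup
import chardet

test_url = "http://www.******.cn"  # placeholder: use a real article page from the target site

html = request.urlopen(test_url).read()
charset = chardet.detect(html)['encoding']
soup = BeautifulSoup(html.decode(charset, 'ignore'), "html.parser")

# If these print the expected title and body text, the class names are correct.
title_node = soup.find(class_='page_bt')
con_node = soup.find(class_='page_wznr_2')
print(title_node.get_text() if title_node else "title class not found")
print(con_node.get_text()[:200] if con_node else "content class not found")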

After running this script, all of the scraped articles end up as text files in the out directory. If this article helped you, please like, share, bookmark, and comment; and if you enjoy my articles, search for and follow "咿哑呀" for more.
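As a quick sanity check after the run, you can list the files the spider wrote (this snippet only inspects the out directory; it is not part of the crawler itself):

import os

# Print the names of the article files produced by the spider.
for name in sorted(os.listdir('out')):
    if name.endswith('.txt'):
        print(name)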
