Modifying web page content with Python: fetch a page, save it as HTML, and rewrite certain tag attributes (based on Python 3.6)

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import urllib.request
import os
from bs4 import BeautifulSoup

# Target URL (value omitted in the original post)
url = ''

# Prefix spliced onto relative links (value omitted in the original post)
Splicing = ''

def get_web(get_url):
    page = urllib.request.urlopen(get_url)
    html = page.read().decode("utf-8")
    all_url = []
    # html is already a decoded str, so no from_encoding argument is needed
    url_list = BeautifulSoup(html, 'html.parser')
    # Collect the relative links inside the menu <div>s
    for list_div in url_list.find_all('div', class_='col_menu_con'):
        for a in list_div.find_all('a', href=True):
            if a.get_text(strip=True):
                # 'http' also matches 'https', so one test skips all absolute links
                if 'http' in a['href']:
                    continue
                all_url.append(a['href'])
    # Download each relative link and save it as a local .html file
    for want_url in all_url:
        jump_url = Splicing + want_url
        name_split = want_url.split('/')
        file_name = name_split[1] + '.html'   # assumes hrefs of the form '/xxx/yyy'
        down_page = urllib.request.urlopen(jump_url)
        down_html = down_page.read()
        write_html = open(file_name, "w+b")
        write_html.write(down_html)
        write_html.close()
        print(file_name + ' ' + 'done!')
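As an aside, the splice Splicing + want_url assumes every collected href is site-relative. The standard-library function urllib.parse.urljoin handles both relative and absolute hrefs against a base, so it is a safer alternative to plain concatenation. A minimal sketch (base_url and the sample hrefs are illustrative stand-ins, since the original post omits the real values):

from urllib.parse import urljoin

# urljoin resolves a relative href against a base and leaves
# absolute hrefs untouched, unlike plain string concatenation.
base_url = 'http://example.com'   # illustrative stand-in for the real site root
print(urljoin(base_url, '/news/index.html'))    # http://example.com/news/index.html
print(urljoin(base_url, 'http://other.example/x'))   # absolute href passes through unchanged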

def change_web(html_file):
    file = html_file
    content = open(file, 'r', encoding="utf-8")
    html_cont = content.read()
    content.close()
    find_content = BeautifulSoup(html_cont, 'lxml')

    # Rewrite <a href> attributes
    for change_a in find_content.find_all('a', href=True):
        if 'http' in change_a['href']:   # skip absolute links (covers https too)
            continue
        change_a['href'] = Splicing + change_a['href']

    # Rewrite <link href> attributes
    for change_link in find_content.find_all('link', href=True):
        if 'http' in change_link['href']:
            continue
        change_link['href'] = Splicing + change_link['href']

    # Rewrite <script src> attributes
    # (this loop header was missing from the original listing; restored to match the other loops)
    for change_script in find_content.find_all('script', src=True):
        if 'http' in change_script['src']:
            continue
        change_script['src'] = Splicing + change_script['src']

    # Rewrite <form action> attributes
    for change_form in find_content.find_all('form', action=True):
        if 'http' in change_form['action']:
            continue
        change_form['action'] = Splicing + change_form['action']

    # Rewrite <img src> attributes
    for change_image in find_content.find_all('img', src=True):
        if 'http' in change_image['src']:
            continue
        change_image['src'] = Splicing + change_image['src']

    # Rewrite <img original_src> attributes (attribute name as given in the original post)
    for change_originalsrc in find_content.find_all('img', original_src=True):
        if 'http' in change_originalsrc['original_src']:
            continue
        change_originalsrc['original_src'] = Splicing + change_originalsrc['original_src']

    # Note especially: the soup must be converted back to str and encoded
    # as UTF-8 before it can be written back to disk
    change_content = str(find_content).encode(encoding='utf-8')
    change_html = open(file, "w+b")
    change_html.write(change_content)
    change_html.close()
    print(file + ' ' + 'changed!')
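The final encode step is the easy part to get wrong: a BeautifulSoup object is neither str nor bytes, so it must be stringified and then encoded before being written in binary mode. A self-contained round trip with an in-memory document (the tag content and file name are arbitrary examples):

from bs4 import BeautifulSoup

doc = BeautifulSoup('<a href="/news/index.html">news</a>', 'html.parser')
doc.a['href'] = 'http://example.com' + doc.a['href']   # same splice as in change_web
data = str(doc).encode('utf-8')   # soup -> str -> UTF-8 bytes
with open('demo.html', 'wb') as f:   # demo.html is an illustrative file name
    f.write(data)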

get_web(url)

# Collect every .html file in the working directory and rewrite its links
filearray = []
file_list = os.listdir(os.getcwd())
for fileNAME in file_list:
    if os.path.splitext(fileNAME)[1] == '.html':
        filearray.append(fileNAME)

for html_file in filearray:
    change_web(html_file)
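For what it's worth, the extension filter above can be collapsed with the standard glob module; an equivalent sketch:

import glob

# glob matches file names by pattern in the current directory,
# replacing the os.listdir + os.path.splitext filtering above
for name in glob.glob('*.html'):
    change_web(name)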
