Modifying web page content with Python: fetch a page, save it as HTML, and rewrite certain tag attributes (based on Python 3.6)

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import urllib.request
import os
from bs4 import BeautifulSoup

# Target URL (value omitted in the original post)
url = ''

# Prefix spliced onto relative links (value omitted in the original post)
Splicing = ''

def get_web(get_url):
    page = urllib.request.urlopen(get_url)
    html = page.read().decode("utf-8")
    all_url = []
    # html is already a decoded str, so no from_encoding argument is needed
    url_list = BeautifulSoup(html, 'html.parser')
    # Collect the relative links inside the menu <div>s
    for list_div in url_list.find_all('div', class_='col_menu_con'):
        for a in list_div.find_all('a', href=True):
            if a.get_text(strip=True):
                # 'http' also matches 'https', so one test skips all absolute links
                if 'http' in a['href']:
                    continue
                all_url.append(a['href'])
    # Download each relative link and save it as a local .html file
    for want_url in all_url:
        jump_url = Splicing + want_url
        name_split = want_url.split('/')
        file_name = name_split[1] + '.html'   # assumes hrefs of the form '/xxx/yyy'
        down_page = urllib.request.urlopen(jump_url)
        down_html = down_page.read()
        write_html = open(file_name, "w+b")
        write_html.write(down_html)
        write_html.close()
        print(file_name + ' ' + 'done!')
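As an aside, the splice Splicing + want_url assumes every collected href is site-relative. The standard-library function urllib.parse.urljoin handles both relative and absolute hrefs against a base, so it is a safer alternative to plain concatenation. A minimal sketch (base_url and the sample hrefs are illustrative stand-ins, since the original post omits the real values):

from urllib.parse import urljoin

# urljoin resolves a relative href against a base and leaves
# absolute hrefs untouched, unlike plain string concatenation.
base_url = 'http://example.com'   # illustrative stand-in for the real site root
print(urljoin(base_url, '/news/index.html'))    # http://example.com/news/index.html
print(urljoin(base_url, 'http://other.example/x'))   # absolute href passes through unchanged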

def change_web(html_file):
    file = html_file
    content = open(file, 'r', encoding="utf-8")
    html_cont = content.read()
    content.close()
    find_content = BeautifulSoup(html_cont, 'lxml')

    # Rewrite <a href> attributes
    for change_a in find_content.find_all('a', href=True):
        if 'http' in change_a['href']:   # skip absolute links (covers https too)
            continue
        change_a['href'] = Splicing + change_a['href']

    # Rewrite <link href> attributes
    for change_link in find_content.find_all('link', href=True):
        if 'http' in change_link['href']:
            continue
        change_link['href'] = Splicing + change_link['href']

    # Rewrite <script src> attributes
    # (this loop header was missing from the original listing; restored to match the other loops)
    for change_script in find_content.find_all('script', src=True):
        if 'http' in change_script['src']:
            continue
        change_script['src'] = Splicing + change_script['src']

    # Rewrite <form action> attributes
    for change_form in find_content.find_all('form', action=True):
        if 'http' in change_form['action']:
            continue
        change_form['action'] = Splicing + change_form['action']

    # Rewrite <img src> attributes
    for change_image in find_content.find_all('img', src=True):
        if 'http' in change_image['src']:
            continue
        change_image['src'] = Splicing + change_image['src']

    # Rewrite <img original_src> attributes (attribute name as given in the original post)
    for change_originalsrc in find_content.find_all('img', original_src=True):
        if 'http' in change_originalsrc['original_src']:
            continue
        change_originalsrc['original_src'] = Splicing + change_originalsrc['original_src']

    # Note especially: the soup must be converted back to str and encoded
    # as UTF-8 before it can be written back to disk
    change_content = str(find_content).encode(encoding='utf-8')
    change_html = open(file, "w+b")
    change_html.write(change_content)
    change_html.close()
    print(file + ' ' + 'changed!')
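The final encode step is the easy part to get wrong: a BeautifulSoup object is neither str nor bytes, so it must be stringified and then encoded before being written in binary mode. A self-contained round trip with an in-memory document (the tag content and file name are arbitrary examples):

from bs4 import BeautifulSoup

doc = BeautifulSoup('<a href="/news/index.html">news</a>', 'html.parser')
doc.a['href'] = 'http://example.com' + doc.a['href']   # same splice as in change_web
data = str(doc).encode('utf-8')   # soup -> str -> UTF-8 bytes
with open('demo.html', 'wb') as f:   # demo.html is an illustrative file name
    f.write(data)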

get_web(url)

# Collect every .html file in the working directory and rewrite its links
filearray = []
file_list = os.listdir(os.getcwd())
for fileNAME in file_list:
    if os.path.splitext(fileNAME)[1] == '.html':
        filearray.append(fileNAME)

for html_file in filearray:
    change_web(html_file)
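For what it's worth, the extension filter above can be collapsed with the standard glob module; an equivalent sketch:

import glob

# glob matches file names by pattern in the current directory,
# replacing the os.listdir + os.path.splitext filtering above
for name in glob.glob('*.html'):
    change_web(name)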
