#
import requests
from bs4 import BeautifulSoup
from urllib import request
from lxml import etree
import time
import os
import re
# --- Fetch the start page and collect every anchor tag on it ---

# Browser-like User-Agent so sites that reject the default requests UA
# still respond.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
}
url = input('请输入抓取网站: ')
# BUG FIX: `headers` was built but never passed to requests.get, so the
# browser User-Agent was never actually sent.  `timeout` keeps the script
# from hanging forever on an unresponsive host.
cent = requests.get(url, headers=headers, timeout=10).content.decode('utf8')
htm = BeautifulSoup(cent, 'lxml')
# All <a> tags on the start page; crawled by the loop further below.
name = htm.find_all('a')
def info(url):
    """Fetch *url* and save its HTML to a local file.

    The filename is ``index_<name>.html`` where ``<name>`` is taken from
    the last ``key=value`` pair of the URL (split on ``;`` then ``=``);
    URLs containing no ``=`` are saved as ``index_index.html``.

    NOTE(review): may raise requests exceptions on network failure and
    UnicodeDecodeError for non-UTF-8 pages — callers do not handle these.
    """
    # BUG FIX: send the browser headers here as well, and bound the
    # request time so one dead link cannot stall the whole crawl.
    cent = requests.get(url, headers=headers, timeout=10).content.decode('utf8')
    html = BeautifulSoup(cent, 'lxml')
    if '=' in url:
        # e.g. '...page;id=3' -> '3'.  NOTE(review): assumes the URL uses
        # ';'-separated 'key=value' parts — confirm against the target site.
        name = url.split(';')[-1].split('=')[1]
    else:
        name = 'index'
    with open('index_' + name + ".html", "w", encoding='utf-8') as file:
        file.write(str(html))
# --- Crawl every root-relative link found on the start page ---
# Hoisted out of the loop: compiling the same pattern once per tag was
# wasted work (behavior is unchanged).
_href_pattern = re.compile(r'href="/.*">')
for i in name:
    h = _href_pattern.findall(str(i))
    if h:
        # 'href="/path">' -> '/path' -> 'path', appended to the base URL.
        # NOTE(review): assumes single-segment paths and a base URL ending
        # in '/' — verify against the target site before relying on this.
        info(url + h[0].split('"')[1].split('/')[1])
# TODO: saving the fetched pages locally is unfinished (original note:
# "抓取页面的保存本地未完成").
# (Leftover blog metadata: latest recommended article published 2024-06-05 13:50:02.)