我们需要 requests、BeautifulSoup 和 python-docx 这三个库,可通过以下命令安装:
pip install requests
pip install beautifulsoup4
pip install python-docx
import requests
from bs4 import BeautifulSoup
from docx import Document
def get_webpage_text(url, timeout=10):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout the request
            can block indefinitely when the server never answers.

    Returns:
        The text of every ``<p>`` element in document order, each one
        followed by a newline. An empty string when the page contains no
        ``<p>`` elements.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            does not complete within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Adjust this selector to match the structure of the target page;
    # here every <p> tag is treated as an article paragraph.
    paragraphs = soup.find_all('p')
    # Join once instead of growing a string with += inside a loop.
    return ''.join(paragraph.get_text() + '\n' for paragraph in paragraphs)
def save_as_word(text_content, output_path):
doc