认识个在学校电台的朋友每天都要在网上找新闻, 国际, 国内, 和校内各五篇, 然后将其做成word文档打印, 遂写了这个python程序实现这个功能.
程序用python3.4编写, 使用到了urllib, 外加两个第三方库: BeautifulSoup(解析网页, 很好用的一个工具)和python-docx(生成word文档, 这个库功能还不是特别强大)
import urllib.request
import os
import shutil
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Category label strings — presumably used as section headings when the
# fetched news is grouped into the Word document (domestic / international);
# TODO confirm against the code that consumes them (not visible in this chunk).
national = "国内"  # "domestic"
international = "国际"  # "international"
def get_html_soup(url):
    """Fetch *url*, decode the body as UTF-8, and return a BeautifulSoup tree.

    Decoding errors are ignored so slightly malformed pages still parse.
    Returns None (after printing the error) when the request fails, e.g. on
    a network error or the 10-second timeout.
    """
    try:
        # Context manager closes the HTTP response even if read()/decode()
        # raises — the original leaked the connection.
        with urllib.request.urlopen(url, timeout=10) as response:
            html = response.read().decode(encoding="utf8", errors="ignore")
    except Exception as e:
        print(e, "please check your network situation")
        return None
    # html is already a str here; the original's str() wrapper was redundant.
    return BeautifulSoup(html, "lxml")
def page_url(url, page_num):
    """Return the URL for page *page_num* of a paginated news listing.

    Page 1 is the bare URL itself; for later pages, "_<n>" is inserted just
    before the file extension (e.g. index.shtml -> index_2.shtml).
    """
    if page_num == 1:
        return url
    dot = url.rfind(".")
    return "".join([url[:dot], "_", str(page_num), url[dot:]])
def get_title_link(url, pattern):#获取新闻的标题和正文链接
soup = get_html_soup(url