{article}
“”"
def save_article(article, title):
    """Save one article as an HTML file and convert that file to PDF.

    The article markup is substituted into the module-level ``html_str``
    template, written to ``重庆新闻/<title>.html`` and then rendered to
    ``重庆新闻pdf/<title>.pdf`` with pdfkit / wkhtmltopdf.

    Args:
        article: Article body to substitute for the ``{article}`` placeholder
            of ``html_str``.
        title: Article title; used verbatim as the base file name.
            NOTE(review): titles containing characters that are illegal in
            file names are not sanitised here -- confirm callers clean them.
    """
    import os

    html_dir = '重庆新闻'
    pdf_dir = '重庆新闻pdf'
    # Create the output folders on first use instead of failing with
    # FileNotFoundError when they are missing.
    os.makedirs(html_dir, exist_ok=True)
    os.makedirs(pdf_dir, exist_ok=True)

    # os.path.join supplies the separator, which avoids a string literal that
    # ends in a backslash (the backslash would escape the closing quote).
    html_path = os.path.join(html_dir, title + '.html')
    pdf_path = os.path.join(pdf_dir, title + '.pdf')

    html = html_str.format(article=article)
    with open(html_path, mode='w', encoding='utf-8') as f:
        f.write(html)
    print('{}已下载完成'.format(title))

    # Location of the wkhtmltopdf executable. This must be a raw string:
    # in a normal literal the "\b" of "\bin" is a backspace character.
    config = pdfkit.configuration(
        wkhtmltopdf=r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
    )
    # Convert the saved HTML file into a PDF file via pdfkit.
    pdfkit.from_file(html_path, pdf_path, configuration=config)
# Save the CSV file
# Open the shared CSV output in append mode; newline='' lets the csv module
# manage line endings itself. The handle is deliberately left open at module
# level because `csv_writer` is used after this point.
f = open('新闻.csv', mode='a', encoding='utf-8-sig', newline='')
csv_writer = csv.DictWriter(f, fieldnames=['标题', '作者', '日期', '新闻详情页'])
# Write the header only when the file is still empty. In append mode the
# position starts at the end of the file, so a non-zero offset means earlier
# runs already wrote a header and repeating it would corrupt the data rows.
if f.tell() == 0:
    csv_writer.writeheader()
# Save images
def save_img(img_urls):
    """Download every image in *img_urls* into the ``新闻图片`` folder.

    Args:
        img_urls: Iterable of image URL strings taken from an article page.
            Site-relative paths are resolved against the news site root;
            absolute URLs are used unchanged.

    Each image is stored under the last path segment of its URL.
    NOTE(review): assumes ``get_html`` returns a response object exposing
    the raw bytes as ``.content`` -- confirm against its definition.
    """
    import os
    from urllib.parse import urljoin

    img_dir = '新闻图片'
    # Create the target folder on first use instead of failing when missing.
    os.makedirs(img_dir, exist_ok=True)

    for src in img_urls:
        # urljoin gives the same result as plain concatenation for
        # "/path" values, but also copes with absolute URLs and with
        # relative paths that lack a leading slash.
        img_url = urljoin('http://news.cqu.edu.cn', src)
        img_content = get_html(img_url).content
        img_name = img_url.split('/')[-1]
        # os.path.join avoids a literal ending in a backslash, which would
        # escape the closing quote.
        with open(os.path.join(img_dir, img_name), mode='wb') as f:
            f.write(img_content)
# Main function
def main(url):
html_data = get_html(url).text
selector = get_pars(html_data)
lis = selector.css(‘body > div:nth-child(4) > div > div.lists .title a::attr(href)’).getall()
for li in lis:
content_data = get_html(li).text
li_selector = get_pars(co