# Crawl the content of every blog post and convert it to PDF format.
import re

import pdfkit
import requests
from bs4 import BeautifulSoup
# <a href="https://blog.csdn.net/qq_41911569/article/details/83034422" target="_blank"><span class="">查看</span></a>
# NOTE(review): gevent's os is imported but never used in this file.
from gevent import os
def getPagehtml(url):
    """Download *url* and return the page source as text.

    :param url: URL of the page to fetch
    :return: decoded HTML body of the response
    """
    # Timeout so a stalled connection cannot hang the whole crawl.
    response = requests.get(url, timeout=30)
    return response.text
def createurl(text, author='qq_41911569'):
    """Extract every article URL for *author* from a list page's HTML.

    Matches anchors of the form
    <a href="https://blog.csdn.net/qq_41911569/article/details/..." target="_blank">...</a>

    :param text: HTML source of a blog list page
    :param author: CSDN user id (default keeps the original hard-coded blog)
    :return: list of matched article URLs (may contain duplicates)
    """
    pattern = r'<a href="(https://blog.csdn.net/%s/article/.*?)" target="_blank">' % author
    return re.findall(pattern, text)
# NOTE(review): removed leftover debug statements that fetched the blog index
# at import time and discarded createurl()'s result; main() does the real crawl.
def get_blog_content(i, url,
                     out_dir='/home/kiosk/Desktop/python笔记/python_stack/day26/bs'):
    """Fetch one blog post and save its head, title and body to a local HTML file.

    :param i: index used to name the output file (westos<i>.html)
    :param url: URL of the individual blog post
    :param out_dir: directory the file is written into (default preserves the
        original hard-coded location)
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    # <head> is kept so the saved page retains the blog's styles/scripts.
    head = soup.head
    title = soup.find_all(class_="title-article")[0].get_text()
    content = soup.find_all(class_="article_content")[0]
    # Explicit UTF-8 so non-ASCII titles/content don't depend on the locale.
    with open('%s/westos%d.html' % (out_dir, i), 'w', encoding='utf-8') as f:
        f.write(str(head))
        f.write('<h1>%s</h1>\n\n' % (title))
        f.write(str(content))
def main():
    """Crawl the first three list pages and save every article found.

    List pages look like https://blog.csdn.net/qq_41911569/article/list/<page>.
    """
    article_url = []
    for page in range(1, 4):
        text = getPagehtml('https://blog.csdn.net/qq_41911569/article/list/%d' % page)
        article_url.extend(createurl(text))
    # Deduplicate while preserving first-seen order so the westos<i>.html
    # numbering is deterministic across runs (set iteration order is not).
    unique_urls = list(dict.fromkeys(article_url))
    for i, v in enumerate(unique_urls):
        get_blog_content(i, v)
# Guard so importing this module does not kick off the crawl.
if __name__ == '__main__':
    main()
# Result: