Python爬虫案例四:爬取某个博主的所有文章保存成PDF格式

引入(将图片保存成PDF格式):

测试链接:
https://zq.bookan.com.cn/?t=detail&id=21088&ct=1&is=31042341&rid=4658(图书馆图片保存PDF),前提是装库,pip install img2pdf
具体步骤:
import requests, img2pdf 
url_list = [ 
'http://img1-qn.bookan.com.cn/page8/3234/3234-310411164/4214b5ac_big.mg', 'http://img1-qn.bookan.com.cn/page8/8314/8314-310441286/8522073f_big.mg'
] 
data_list = [requests.get(url).content for url in url_list] 
# 1、准备宽度 + 高度 
width = img2pdf.mm_to_pt(300) 
height = img2pdf.mm_to_pt(300) 
# 2、准备空白的PDF页面 
pdf_size = img2pdf.get_layout_fun((width, height)) // 版图 布局,此时PDF是空的 
# 3、添加数据 (img的数据, pdf的size) 
pdf_data = img2pdf.convert(data_list, layout_fun=pdf_size) 
# 4、保存 
with open('测试.pdf', 'wb') as f: f.write(pdf_data) print('ok')   
测试结果:

 案例实战: 抓取CSDN某位博主的文章并将其保存成PDF格式(先抓取一篇然后批量)

源码:

# 爬虫部分  ====> 代码可复用  
import requests,parsel, pdfkit
from lxml import etree

def get_all():
    # 批量下载博主的全部文章 ==> 取列表页获取所有的url
    info_url = 'https://blog.csdn.net/2301_80014606'
    cookies = {
        'uuid_tt_dd': '10_10174532770-1711812940092-685570',
        'Hm_up_6bcd52f51e9b3dce32bec4a3997715ac': '%7B%22islogin%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D',
        'cf_clearance': 'LEjKw1U8FYwFrjTgFMgFQzHVdqSloAwUhWOl5yCq_Og-1716465784-1.0.1.1-G0ycyrRoC9DCMQtiQoceRTB4oeUXPKSRASfv7PAxGkqfQk8p5RJVwkCd5NiS7rXLxchn0FxjrNqhStHM_OtkWw',
        'UserName': 'm0_74614835',
        'UserInfo': '8268477d59434a4194615c67cd9ea26d',
        'UserToken': '8268477d59434a4194615c67cd9ea26d',
        'UserNick': 'm0_74614835',
        'AU': '23E',
        'UN': 'm0_74614835',
        'BT': '1718679492185',
        'p_uid': 'U010000',
        'm0_74614835comment_new': '1720492338243',
        'firstDie': '1',
        'Hm_lvt_ec8a58cd84a81850bcbd95ef89524721': '1720707980,1721100768,1721197835,1721446756',
        'c_dl_fref': 'https://www.iteye.com/',
        'c_utm_source': 'iteye',
        'c_dl_prid': '1721446933332_253428',
        'c_dl_rid': '1721446989577_111570',
        'c_dl_fpage': '/download/Programmer_FuQiang/12187394',
        'c_dl_um': 'distribute.pc_relevant_download.none-task-download-2%7Edefault%7Ekeyword%7ERate-6-12187394-download-18698164.257%5Ev16%5Epc_dl_relevant_base1_a',
        'c_segment': '2',
        'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac': '1721100769,1721197835,1721446756,1721521897',
        'HMACCOUNT': '66A8254591DC78E3',
        'https_waf_cookie': '50bed8ef-937c-4dd73e2026e8fe2a1adf6adc04dac5006194',
        'dc_sid': '167e0952bfbaec891be59557bcfa7617',
        '_clck': '16zlski%7C2%7Cfnn%7C0%7C1550',
        'csrfToken': 'SmchSM7MO-KHW5qJeFuboz29',
        '__gads': 'ID=32681eea0b3f2dc4:T=1711812942:RT=1721525715:S=ALNI_MZOcGfXX7k9EPNVXFUB9vOn-7JQ4w',
        '__gpi': 'UID=00000d7858e98a50:T=1711812942:RT=1721525715:S=ALNI_MZrVxnKqRLfUnUvHdPxcMdkGTiYUg',
        '__eoi': 'ID=854a41bc034210ac:T=1711812942:RT=1721525715:S=AA-AfjbYkMhl8MU1tq93LJL52Q-q',
        'c_first_ref': 'default',
        'c_utm_medium': 'distribute.pc_feed_blog_category.none-task-blog-classify_tag-3-139705227-null-null.nonecase',
        'dc_session_id': '10_1721527944582.357468',
        'c_pref': 'default',
        'creativeSetApiNew': '%7B%22toolbarImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20231011044944.png%22%2C%22publishSuccessImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20240229024608.png%22%2C%22articleNum%22%3A0%2C%22type%22%3A0%2C%22oldUser%22%3Afalse%2C%22useSeven%22%3Atrue%2C%22oldFullVersion%22%3Afalse%2C%22userName%22%3A%22m0_74614835%22%7D',
        'log_Id_click': '144',
        'waf_captcha_marker': '2ad0ccbd0b616ff0b7f0ead6dcb53d42a740df830a745a7366d633b88332b078',
        'c_ref': 'default',
        'c_first_page': 'https%3A//blog.csdn.net/2301_80014606%3Ftype%3Dblog',
        'c_dsid': '11_1721528107847.945530',
        'c_page_id': 'default',
        'log_Id_pv': '111',
        'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac': '1721528111',
        'log_Id_view': '3535',
        '_clsk': '112h5bx%7C1721528118664%7C1%7C0%7Cy.clarity.ms%2Fcollect',
        'dc_tos': 'sgybm1',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Cookie': 'uuid_tt_dd=10_10174532770-1711812940092-685570; Hm_up_6bcd52f51e9b3dce32bec4a3997715ac=%7B%22islogin%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D; cf_clearance=LEjKw1U8FYwFrjTgFMgFQzHVdqSloAwUhWOl5yCq_Og-1716465784-1.0.1.1-G0ycyrRoC9DCMQtiQoceRTB4oeUXPKSRASfv7PAxGkqfQk8p5RJVwkCd5NiS7rXLxchn0FxjrNqhStHM_OtkWw; UserName=m0_74614835; UserInfo=8268477d59434a4194615c67cd9ea26d; UserToken=8268477d59434a4194615c67cd9ea26d; UserNick=m0_74614835; AU=23E; UN=m0_74614835; BT=1718679492185; p_uid=U010000; m0_74614835comment_new=1720492338243; firstDie=1; Hm_lvt_ec8a58cd84a81850bcbd95ef89524721=1720707980,1721100768,1721197835,1721446756; c_dl_fref=https://www.iteye.com/; c_utm_source=iteye; c_dl_prid=1721446933332_253428; c_dl_rid=1721446989577_111570; c_dl_fpage=/download/Programmer_FuQiang/12187394; c_dl_um=distribute.pc_relevant_download.none-task-download-2%7Edefault%7Ekeyword%7ERate-6-12187394-download-18698164.257%5Ev16%5Epc_dl_relevant_base1_a; c_segment=2; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1721100769,1721197835,1721446756,1721521897; HMACCOUNT=66A8254591DC78E3; https_waf_cookie=50bed8ef-937c-4dd73e2026e8fe2a1adf6adc04dac5006194; dc_sid=167e0952bfbaec891be59557bcfa7617; _clck=16zlski%7C2%7Cfnn%7C0%7C1550; csrfToken=SmchSM7MO-KHW5qJeFuboz29; __gads=ID=32681eea0b3f2dc4:T=1711812942:RT=1721525715:S=ALNI_MZOcGfXX7k9EPNVXFUB9vOn-7JQ4w; __gpi=UID=00000d7858e98a50:T=1711812942:RT=1721525715:S=ALNI_MZrVxnKqRLfUnUvHdPxcMdkGTiYUg; __eoi=ID=854a41bc034210ac:T=1711812942:RT=1721525715:S=AA-AfjbYkMhl8MU1tq93LJL52Q-q; c_first_ref=default; c_utm_medium=distribute.pc_feed_blog_category.none-task-blog-classify_tag-3-139705227-null-null.nonecase; dc_session_id=10_1721527944582.357468; c_pref=default; creativeSetApiNew=%7B%22toolbarImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20231011044944.png%22%2C%22publishSuccessImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20240229024608.png%22%2C%22articleNum%22%3A0%2C%22type%22%3A0%2C%22oldUser%22%3Afalse%2C%22useSeven%22%3Atrue%2C%22oldFullVersion%22%3Afalse%2C%22userName%22%3A%22m0_74614835%22%7D; log_Id_click=144; waf_captcha_marker=2ad0ccbd0b616ff0b7f0ead6dcb53d42a740df830a745a7366d633b88332b078; c_ref=default; c_first_page=https%3A//blog.csdn.net/2301_80014606%3Ftype%3Dblog; c_dsid=11_1721528107847.945530; c_page_id=default; log_Id_pv=111; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1721528111; log_Id_view=3535; _clsk=112h5bx%7C1721528118664%7C1%7C0%7Cy.clarity.ms%2Fcollect; dc_tos=sgybm1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    params = {
        'type': 'blog',
    }
    response = requests.get(info_url, params=params, cookies=cookies, headers=headers).text
    A = etree.HTML(response)
    url_list = A.xpath('//article[@class="blog-list-box"]/a/@href')
    for url in url_list:
        get_one(url)

def get_one(url):
    # 下载博客的某一个文章
    cookies = {
        'uuid_tt_dd': '10_10174532770-1711812940092-685570',
        'Hm_up_6bcd52f51e9b3dce32bec4a3997715ac': '%7B%22islogin%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D',
        'cf_clearance': 'LEjKw1U8FYwFrjTgFMgFQzHVdqSloAwUhWOl5yCq_Og-1716465784-1.0.1.1-G0ycyrRoC9DCMQtiQoceRTB4oeUXPKSRASfv7PAxGkqfQk8p5RJVwkCd5NiS7rXLxchn0FxjrNqhStHM_OtkWw',
        'UserName': 'm0_74614835',
        'UserInfo': '8268477d59434a4194615c67cd9ea26d',
        'UserToken': '8268477d59434a4194615c67cd9ea26d',
        'UserNick': 'm0_74614835',
        'AU': '23E',
        'UN': 'm0_74614835',
        'BT': '1718679492185',
        'p_uid': 'U010000',
        'm0_74614835comment_new': '1720492338243',
        'c_segment': '2',
        'firstDie': '1',
        'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac': '1720707980,1721100769,1721197835,1721446756',
        'HMACCOUNT': '66A8254591DC78E3',
        'Hm_lvt_ec8a58cd84a81850bcbd95ef89524721': '1720707980,1721100768,1721197835,1721446756',
        'Hm_lpvt_ec8a58cd84a81850bcbd95ef89524721': '1721446756',
        'dc_sid': '37a800e065792e9d25758f94942b724b',
        'c_dl_fref': 'https://www.iteye.com/',
        'c_utm_source': 'iteye',
        '_clck': '16zlski%7C2%7Cfnm%7C0%7C1550',
        'c_dl_prid': '1721446933332_253428',
        'c_dl_rid': '1721446989577_111570',
        'c_dl_fpage': '/download/Programmer_FuQiang/12187394',
        'c_dl_um': 'distribute.pc_relevant_download.none-task-download-2%7Edefault%7Ekeyword%7ERate-6-12187394-download-18698164.257%5Ev16%5Epc_dl_relevant_base1_a',
        'dc_session_id': '10_1721489180573.741111',
        'c_first_ref': 'www.baidu.com',
        'c_first_page': 'https%3A//www.csdn.net/',
        'c_dsid': '11_1721489179823.492641',
        'creativeSetApiNew': '%7B%22toolbarImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20231011044944.png%22%2C%22publishSuccessImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20240229024608.png%22%2C%22articleNum%22%3A0%2C%22type%22%3A0%2C%22oldUser%22%3Afalse%2C%22useSeven%22%3Atrue%2C%22oldFullVersion%22%3Afalse%2C%22userName%22%3A%22m0_74614835%22%7D',
        'https_waf_cookie': 'b0c0b344-8c52-498bc5a67d7393bbce036bff877d4003edec',
        'log_Id_click': '131',
        'c_pref': 'https%3A//www.csdn.net/',
        'c_ref': 'https%3A//i.csdn.net/',
        'c_page_id': 'default',
        'log_Id_pv': '93',
        'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac': '1721489207',
        '__gads': 'ID=32681eea0b3f2dc4:T=1711812942:RT=1721489211:S=ALNI_MZOcGfXX7k9EPNVXFUB9vOn-7JQ4w',
        '__gpi': 'UID=00000d7858e98a50:T=1711812942:RT=1721489211:S=ALNI_MZrVxnKqRLfUnUvHdPxcMdkGTiYUg',
        '__eoi': 'ID=854a41bc034210ac:T=1711812942:RT=1721489211:S=AA-AfjbYkMhl8MU1tq93LJL52Q-q',
        '_clsk': '14eo8p7%7C1721489210658%7C1%7C0%7Cx.clarity.ms%2Fcollect',
        'log_Id_view': '2833',
        'waf_captcha_marker': '9c2b63af339e16b26460039c73324b937b80bb04c7836dd3ef90ded0e605b8fe',
        'dc_tos': 'sgxhl0',
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Cookie': 'uuid_tt_dd=10_10174532770-1711812940092-685570; Hm_up_6bcd52f51e9b3dce32bec4a3997715ac=%7B%22islogin%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D; cf_clearance=LEjKw1U8FYwFrjTgFMgFQzHVdqSloAwUhWOl5yCq_Og-1716465784-1.0.1.1-G0ycyrRoC9DCMQtiQoceRTB4oeUXPKSRASfv7PAxGkqfQk8p5RJVwkCd5NiS7rXLxchn0FxjrNqhStHM_OtkWw; UserName=m0_74614835; UserInfo=8268477d59434a4194615c67cd9ea26d; UserToken=8268477d59434a4194615c67cd9ea26d; UserNick=m0_74614835; AU=23E; UN=m0_74614835; BT=1718679492185; p_uid=U010000; m0_74614835comment_new=1720492338243; c_segment=2; firstDie=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1720707980,1721100769,1721197835,1721446756; HMACCOUNT=66A8254591DC78E3; Hm_lvt_ec8a58cd84a81850bcbd95ef89524721=1720707980,1721100768,1721197835,1721446756; Hm_lpvt_ec8a58cd84a81850bcbd95ef89524721=1721446756; dc_sid=37a800e065792e9d25758f94942b724b; c_dl_fref=https://www.iteye.com/; c_utm_source=iteye; _clck=16zlski%7C2%7Cfnm%7C0%7C1550; c_dl_prid=1721446933332_253428; c_dl_rid=1721446989577_111570; c_dl_fpage=/download/Programmer_FuQiang/12187394; c_dl_um=distribute.pc_relevant_download.none-task-download-2%7Edefault%7Ekeyword%7ERate-6-12187394-download-18698164.257%5Ev16%5Epc_dl_relevant_base1_a; dc_session_id=10_1721489180573.741111; c_first_ref=www.baidu.com; c_first_page=https%3A//www.csdn.net/; c_dsid=11_1721489179823.492641; creativeSetApiNew=%7B%22toolbarImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20231011044944.png%22%2C%22publishSuccessImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20240229024608.png%22%2C%22articleNum%22%3A0%2C%22type%22%3A0%2C%22oldUser%22%3Afalse%2C%22useSeven%22%3Atrue%2C%22oldFullVersion%22%3Afalse%2C%22userName%22%3A%22m0_74614835%22%7D; https_waf_cookie=b0c0b344-8c52-498bc5a67d7393bbce036bff877d4003edec; log_Id_click=131; c_pref=https%3A//www.csdn.net/; c_ref=https%3A//i.csdn.net/; c_page_id=default; log_Id_pv=93; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1721489207; __gads=ID=32681eea0b3f2dc4:T=1711812942:RT=1721489211:S=ALNI_MZOcGfXX7k9EPNVXFUB9vOn-7JQ4w; __gpi=UID=00000d7858e98a50:T=1711812942:RT=1721489211:S=ALNI_MZrVxnKqRLfUnUvHdPxcMdkGTiYUg; __eoi=ID=854a41bc034210ac:T=1711812942:RT=1721489211:S=AA-AfjbYkMhl8MU1tq93LJL52Q-q; _clsk=14eo8p7%7C1721489210658%7C1%7C0%7Cx.clarity.ms%2Fcollect; log_Id_view=2833; waf_captcha_marker=9c2b63af339e16b26460039c73324b937b80bb04c7836dd3ef90ded0e605b8fe; dc_tos=sgxhl0',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    response = requests.get(url, cookies=cookies, headers=headers).text     
    # print(response)  # 将抓取到的文本单独保存成html文件然后进行解析
    # ------xpath取标题-----------
    A = etree.HTML(response)
    title = A.xpath('//h1/text()')[0]  # 标题是h1标签,不一定全是title标签
    # --------css取正文-----------
    B = parsel.Selector(response)
    data = B.css('#content_views').get()
    html_data = \
    '''
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>Title</title>
        </head>
        <body>
            {}
        </body>
        </html>
    '''.format(data)
    with open('csdn正文.html', 'w', encoding='utf-8') as f:
        f.writelines(html_data)
    To_pdf(title)

def To_pdf(title):
    # 转化PDF  引入工具 -- 进行转换
    kit = pdfkit.configuration(wkhtmltopdf=r'E:\wkhtmltopdf\bin\wkhtmltopdf.exe')
    pdfkit.from_file('csdn正文.html', f'{title}.pdf', configuration=kit)
    print('保存OK--{}'.format(title))


def main():
    get_all()

if __name__ == '__main__':
    main() 

运行效果(只列举其中一篇文章):

 

  • 14
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值