python html转pdf 调整大小_使用 Python 将 HTML 转成 PDF

#coding=utf-8

from __future__ import unicode_literals

import os,sys,re,time

import requests,codecs

from bs4 import BeautifulSoup

from urllib.parse import urlparse

import pdfkit

import platform

requests.packages.urllib3.disable_warnings()

system=platform.system()

print(sys.getdefaultencoding())

str_encode='gbk' if system is 'Windows' else 'utf-8'

print(str_encode)

html_template = """

{content}

"""

if not os.path.exists(os.path.join(os.path.dirname(__file__),'html')):

os.mkdir(os.path.join(os.path.dirname(__file__),'html'))

url_list=[]

start_url='http://www.ziqiangxuetang.com/django/django-tutorial.html'

# s=requests.session()

# html_doc=s.get('{}'.format(start_url),verify=False).content

# soup = BeautifulSoup(html_doc,'html.parser')

# print(soup.prettify())

def get_url_list(url):

"""

获取所有URL目录列表

:return:

"""

last_position = find_last(url, "/") + 1

tutorial_url_head = url[0:last_position]

domain = get_domain(url) + "/"

print(domain)

response = requests.get(url)

soup = BeautifulSoup(response.content, "html.parser")

urls = []

for a in soup.find_all("a"):

href = str(a.get('href'))

result = href.find('/')

if result == -1:

url = tutorial_url_head + href

else:

url = domain + href

if 'django' in url:

urls.append(url)

return urls

def find_last(string, char):

last_position = -1

while True:

position = string.find(char, last_position + 1)

if position == -1:

return last_position

last_position = position

def get_domain(url):

r = urlparse(url)

return r.scheme + "://" + r.netloc

def parse_url_to_html(url,name):

"""

解析URL,返回HTML内容

:param url:解析的url

:param name: 保存的html文件名

:return: html

"""

try:

response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

# 正文

body = soup.find_all(class_="w-col l10 m12")

h = str(body)

html = h[1:-1]

html = html_template.format(content=html)

html = html.encode("utf-8")

title=soup.title.get_text()

print(url)

with open('{}/{}'.format(os.path.join(os.path.dirname(__file__),'html'),name), 'wb') as f:

f.write(html)

return '{}/{}'.format(os.path.join(os.path.dirname(__file__),'html'),name)

except Exception as e:

print(e)

def save_pdf(htmls, file_name):

"""

把所有html文件保存到pdf文件

:param htmls: html文件列表

:param file_name: pdf文件名

:return:

"""

options = {

'page-size': 'Letter',

'margin-top': '0.75in',

'margin-right': '0.75in',

'margin-bottom': '0.75in',

'margin-left': '0.75in',

'encoding': "UTF-8",

'custom-header': [

('Accept-Encoding', 'gzip')

],

'cookie': [

('cookie-name1', 'cookie-value1'),

('cookie-name2', 'cookie-value2'),

],

'outline-depth': 10,

}

pdfkit.from_file(htmls, file_name, options=options)

def main():

start = time.time()

urls = get_url_list(start_url)

htmls = [parse_url_to_html(url, str(index) + ".html") for index, url in enumerate(urls)]

print(htmls)

try:

save_pdf(htmls, 'cralwer_{}.pdf'.format(time.strftime('%Y_%m_%d_%H_%M_%S')))

except Exception as e:

print(e)

for html in htmls:

os.remove(html)

total_time = time.time() - start

print(u"总共耗时:{0:.2f}秒".format(total_time))

main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值