python的pdf文件处理和图片处理

爬取网站的内容,并且写入pdf文件

  • 首先抓取目录页的内容,用正则表达式提取出各章节html页面的url。
import re
import requests


# Collect the absolute URL of every chapter linked from the index page.

# Captures the body of the <ul> that follows the "目录列表" heading.
reg = re.compile(r"<h3>目录列表</h3>\s+<ul>\s+([\s\S]*?</ul>)")
url = "http://www.apelearn.com/study_v2/"
session = requests.session()
r = session.get(url)
# Decode the response body as UTF-8 before reading r.text.
r.encoding = "utf-8"
html = r.text
htmlli = reg.findall(html)
# Captures the target of each href="..." attribute inside the listing.
regurl = re.compile(r'''href="(.*?)"''')
# Fall back to an empty list when the listing is not found, so the loop
# below never fails on an IndexError or on an undefined `result`.
result = regurl.findall(htmlli[0]) if htmlli else []

urls = list()
for i in result:
    url = "http://www.apelearn.com/study_v2/{0}".format(i)
    print(url)
    # Store the chapter URL itself (not the list that is being built).
    urls.append(url)

#输出结果如下
http://www.apelearn.com/study_v2/chapter1.html
http://www.apelearn.com/study_v2/chapter2.html
http://www.apelearn.com/study_v2/chapter3.html
http://www.apelearn.com/study_v2/chapter4.html
http://www.apelearn.com/study_v2/chapter5.html
http://www.apelearn.com/study_v2/chapter6.html
http://www.apelearn.com/study_v2/chapter7.html
http://www.apelearn.com/study_v2/chapter8.html
http://www.apelearn.com/study_v2/chapter9.html
http://www.apelearn.com/study_v2/chapter10.html
http://www.apelearn.com/study_v2/chapter11.html
http://www.apelearn.com/study_v2/chapter12.html
http://www.apelearn.com/study_v2/chapter13.html
http://www.apelearn.com/study_v2/chapter14.html
http://www.apelearn.com/study_v2/chapter15.html
http://www.apelearn.com/study_v2/chapter16.html
http://www.apelearn.com/study_v2/chapter17.html
http://www.apelearn.com/study_v2/chapter18.html
http://www.apelearn.com/study_v2/chapter19.html
http://www.apelearn.com/study_v2/chapter20.html
http://www.apelearn.com/study_v2/chapter21.html
http://www.apelearn.com/study_v2/chapter22.html
http://www.apelearn.com/study_v2/chapter23.html
http://www.apelearn.com/study_v2/chapter24.html
http://www.apelearn.com/study_v2/chapter25.html
http://www.apelearn.com/study_v2/chapter26.html
  • 根据url为每一章生成单独的pdf文件。运行下面的代码会自动创建一个aming目录,并把每一章的内容保存为一个pdf文件
import codecs
import os
import sys

import pdfkit
import requests

# Render chapters 1..26 of the site to "aming/<n>.pdf", one file per chapter.

base_url = 'http://www.apelearn.com/study_v2/'
# Create the output directory on the first run; no-op when it already exists.
os.makedirs("aming", exist_ok=True)

os.chdir("aming")
# NOTE: this session is not used by the loop below; it is kept so the
# module-level name `s` stays available.
s = requests.session()

# Build the wkhtmltopdf configuration once: it does not change per chapter.
config = pdfkit.configuration(wkhtmltopdf=r"D:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")

for i in range(1, 27):
    url = base_url + 'chapter' + str(i) + '.html'
    print(url)
    file = str(i) + '.pdf'
    print(file)
    try:
        # Pass the configuration explicitly so the wkhtmltopdf binary
        # configured above is the one that gets used.
        pdfkit.from_url(url, file, configuration=config)
    except Exception as exc:
        # Best effort: report the failed chapter and move on to the next
        # one, without hiding KeyboardInterrupt/SystemExit.
        print("failed to convert {0}: {1}".format(url, exc))
        continue
  • 导入PyPDF2模块,处理pdf相关的操作(把各章节的pdf合并成一个文件)
import codecs

import PyPDF2
import os

# Merge the per-chapter PDFs in "aming" into a single "aminglinux.pdf".

# Keep only files named "<number>.pdf". This skips a previously generated
# "aminglinux.pdf", which would otherwise break the numeric sort below.
files = list()
for fileName in os.listdir("aming"):
    stem, ext = os.path.splitext(fileName)
    if ext == ".pdf" and stem.isdecimal():
        files.append(fileName)

# Sort by chapter number so that "10.pdf" comes after "9.pdf".
newFiles = sorted(files, key=lambda d: int(os.path.splitext(d)[0]))
print(newFiles)


os.chdir("aming")
pdfWriter = PyPDF2.PdfFileWriter()  # start from an empty PDF
# The source files stay open until the merged file has been written, since
# the writer may still read page data from them; they are all closed in the
# `finally` clause, even when a chapter fails to parse.
openedFiles = []
try:
    for item in newFiles:
        handle = open(item, "rb")
        openedFiles.append(handle)
        pdfReader = PyPDF2.PdfFileReader(handle)
        for page in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(page))

    # Plain binary open: no text codec is involved when writing a PDF.
    with open("aminglinux.pdf", "wb") as f:
        pdfWriter.write(f)
finally:
    for handle in openedFiles:
        handle.close()
  • 在aming目录下面可以看到aminglinux.pdf文件

pillow模块的操作

  • 图片的裁剪
from PIL import Image
# Open the picture and preview one rectangular region cut out of it.
crop_box = (320, 65, 460, 220)  # (left, upper, right, lower) in pixels
image = Image.open("test.jpg")

cutjpg = image.crop(crop_box)
cutjpg.show()
  • 裁剪图片,对图片旋转,粘贴
from PIL import Image
# Cut a region out of the picture, turn it upside down (180 degrees) and
# paste it back over the same area, then display the result.
position = (320, 65, 460, 220)
image = Image.open("test.jpg")
region = image.crop(position)
cutjpg = region.transpose(Image.ROTATE_180)
image.paste(cutjpg, position)
image.show()
  • 对图片设置大小
from PIL import Image
# Rotate one region of the picture in place, then shrink the whole picture
# to a width of 30 pixels with a proportionally scaled height.
position = (320, 65, 460, 220)
image = Image.open("test.jpg")
region = image.crop(position)
cutjpg = region.transpose(Image.ROTATE_180)
image.paste(cutjpg, position)

x, y = image.size
newx = 30
# Scale the height by the same factor as the width, truncated to an int.
newy = int(y * newx / x)
target_size = (newx, newy)
newimage = image.resize(target_size)
newimage.show()
  • 使用python编写验证码的程序(从廖雪峰网站找到一个例子程序,根据注释,很好理解。)
from PIL import Image, ImageDraw, ImageFont, ImageFilter

import random

# Random letter:
def rndChar():
    """Return one randomly chosen uppercase ASCII letter (A-Z)."""
    code_point = random.randint(ord("A"), ord("Z"))
    return chr(code_point)

# Random colour 1:
def rndColor():
    """Return a random (r, g, b) tuple with every channel in 64..255."""
    return tuple(random.randint(64, 255) for _ in range(3))

# Random colour 2:
def rndColor2():
    """Return a random (r, g, b) tuple with every channel in 32..127."""
    low, high = 32, 127
    red = random.randint(low, high)
    green = random.randint(low, high)
    blue = random.randint(low, high)
    return (red, green, blue)

# Canvas is 240 x 60: four 60-pixel-wide cells, one per character.
cell = 60
width = cell * 4
height = 60
# White RGB canvas to draw on.
image = Image.new('RGB', (width, height), (255, 255, 255))
# Font object used for the characters.
font = ImageFont.truetype('Arvo-Regular.ttf', 36)
# Drawing handle bound to the canvas.
draw = ImageDraw.Draw(image)
# Background: give every pixel its own random colour.
for col in range(width):
    for row in range(height):
        draw.point((col, row), fill=rndColor())
# Text: one random letter per cell, offset 10 pixels from the cell's
# left edge and from the top of the canvas.
for slot in range(4):
    origin = (cell * slot + 10, 10)
    draw.text(origin, rndChar(), font=font, fill=rndColor2())
# Blur step, currently disabled:
# image = image.filter(ImageFilter.BLUR)
image.save('code.jpg', 'jpeg')
image.show()
  • 1
    点赞
  • 10
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值