Python学习_30
数据处理(二)
1、抓取aming的linux教程,然后制作成pdf文件
a、先抓取每个网页
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 22:02
# @Author : xiexiaolong
# @File : demon.py
import codecs
import os
import sys
import pdfkit
import requests
# Render every tutorial chapter page into its own PDF inside ./aming.
# NOTE(review): `base_url` is not defined in the visible part of this script;
# it has to be assigned (the tutorial's root URL, presumably ending in "/")
# before this code runs.
os.makedirs("aming", exist_ok=True)
os.chdir("aming")

# Build the wkhtmltopdf configuration once, outside the loop, and actually
# pass it to pdfkit below; the original created it but never used it, so the
# explicit binary path had no effect.
config = pdfkit.configuration(
    wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
)

for i in range(1, 27):
    url = base_url + 'chapter' + str(i) + '.html'
    print(url)
    file = str(i) + '.pdf'
    print(file)
    try:
        pdfkit.from_url(url, file, configuration=config)
    except Exception as exc:
        # Best effort: report the failed chapter and carry on with the next
        # one instead of silently swallowing every error (including Ctrl-C).
        print("failed to convert %s: %s" % (url, exc))
        continue
b、合并到新的pdf文件
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 21:45
# @Author : xiexiaolong
# @File : demon2.py
import codecs
import PyPDF2
import os
# Collect the per-chapter PDFs ("1.pdf", "2.pdf", ...) in numeric order.
# Only purely numeric stems are accepted, so a previously generated
# "aminglinux.pdf" (or any other stray PDF) can neither break int() in the
# sort key nor be merged into itself when the script is run again.
files = list()
for fileName in os.listdir("aming"):
    stem, ext = os.path.splitext(fileName)
    if ext == ".pdf" and stem.isdecimal():
        files.append(fileName)
newFiles = sorted(files, key=lambda d: int(os.path.splitext(d)[0]))
print(newFiles)
os.chdir("aming")

# NOTE(review): PdfFileWriter / PdfFileReader / numPages / getPage / addPage
# are the legacy PyPDF2 names; confirm they exist in the installed version.
pdfWriter = PyPDF2.PdfFileWriter()  # start from an empty PDF
handles = []
try:
    for item in newFiles:
        # Presumably the reader still pulls data from its file while the
        # writer serialises, so the inputs are kept open until after write().
        handle = open(item, "rb")
        handles.append(handle)
        pdfReader = PyPDF2.PdfFileReader(handle)
        for page in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(page))
    # Plain binary open(); no codec is involved when writing PDF bytes.
    with open("aminglinux.pdf", "wb") as f:
        pdfWriter.write(f)
finally:
    # Close every input file, even if reading or writing failed part-way.
    for handle in handles:
        handle.close()
也可以通过正则匹配
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 21:40
# @Author : xiexiaolong
# @File : demon1.py
import re
import requests
# Fetch a page and extract links from it with two regular expressions.
# NOTE(review): `url`, `reg` and `regurl` are not defined in the visible part
# of this script; they must be assigned before this point (presumably the
# page URL and two compiled patterns).
session = requests.session()
r = session.get(url)
r.encoding = "utf-8"  # decode the response body as UTF-8
html = r.text
htmlli = reg.findall(html)

# Guard the "no match" case: indexing an empty list raised IndexError, and
# `result` was left undefined for the loop below when the first match was
# empty.
result = regurl.findall(htmlli[0]) if htmlli and htmlli[0] else []

urls = list()
for i in result:
    # Use the matched link itself; the original printed the page URL on every
    # iteration and appended the `urls` list to itself.
    # NOTE(review): if the pattern captures relative paths, prefix them with
    # the site root here.
    print(i)
    urls.append(i)
图片处理
1、Image
Python 的图片处理依赖第三方库:
PIL(Python Imaging Library);如果是 Python 3,则使用 Pillow 模块
直接用 pip install pillow 安装
导入时使用 from PIL import Image
打开一张图片
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 22:23
# @Author : xiexiaolong
# @File : demon3.py
from PIL import Image

# Open the sample picture, print its basic properties and display it.
image = Image.open("12.jpg")
print(image.format, image.size, image.mode)
image.show()  # open the picture in an image viewer
2、处理图片的方法
#image的方法
#image.show()
#Image.open(file)
#image.save(outputfile)
#image.crop((left, upper, right, lower))#抠图,参数是一个四元组
# Image的几何处理:
# out = im.resize((128, 128)) #调整图片大小
# out = im.rotate(45) #逆时针旋转 45 度角。
# out = im.transpose(Image.FLIP_LEFT_RIGHT) #左右对换。
# out = im.transpose(Image.FLIP_TOP_BOTTOM) #上下对换。
# out = im.transpose(Image.ROTATE_90) #旋转 90 度角。
# out = im.transpose(Image.ROTATE_180) #旋转 180 度角。
# out = im.transpose(Image.ROTATE_270) #旋转 270 度角。
3、抠图
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 22:23
# @Author : xiexiaolong
# @File : demon3.py
from PIL import Image

# Cut a rectangular region out of the picture and save it as a new file.
image = Image.open("12.jpg")
print(image.format, image.size, image.mode)
box = (330, 90, 452, 232)  # crop rectangle passed to image.crop()
region = image.crop(box)
region.save("cutting.jpg")
分析:抠图,并保存新图片到cutting.jpg中
4、图片合并
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 22:23
# @Author : xiexiaolong
# @File : demon3.py
from PIL import Image

# Crop a region, rotate it by 180 degrees and paste it back in place.
image = Image.open("12.jpg")
print(image.format, image.size, image.mode)
box = (330, 90, 452, 232)  # crop rectangle, reused as the paste target
cropped = image.crop(box)
region = cropped.transpose(Image.ROTATE_180)
image.paste(region, box)
image.show()
5、缩放
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 22:23
# @Author : xiexiaolong
# @File : demon3.py
from PIL import Image

# Scale the picture to a fixed width of 300 pixels, keeping its aspect ratio.
infile = "12.jpg"
# NOTE(review): `outfile` is never used below; out.save(outfile) may have
# been intended - confirm before adding it.
outfile = "new12.jpg"
image = Image.open(infile)
(x, y) = image.size
newx = 300
newy = int(y * newx / x)  # height that preserves the original proportions
out = image.resize((newx, newy))  # resize() takes one (width, height) tuple
out.show()
注意:
resize 的参数必须是一个 (宽, 高) 元组,即 out = image.resize((newx, newy)),不能写成两个独立的参数
6、验证码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018\6\11 0011 22:23
# @Author : xiexiaolong
# @File : demon3.py
import random
import string
from PIL import Image, ImageFont, ImageDraw, ImageFilter
# Captcha settings read by getNumber(), getLine() and getCode().
font_path = "msyh.ttf"  # font file handed to ImageFont.truetype()
number = 4  # how many characters getNumber() samples
size = (100, 30)  # unpacked as (width, height) in getCode()
bgcolor = (255, 255, 255)  # colour the new image is created with
fontcolor = (0, 0, 255)  # fill colour of the captcha text
linecolor = (255, 0, 0)  # fill colour of the interference lines
draw_line = True  # whether getCode() draws interference lines at all
# Number of interference lines to draw (the original note calls this the
# upper/lower bound, but the code uses it as a single fixed count).
line_number = 3
# Generate a random string
def getNumber(length=None):
    """Return a random string of distinct ASCII letters and digits.

    Args:
        length: Number of characters to draw. When omitted, the module-level
            ``number`` setting is used.

    Returns:
        A string of ``length`` characters. Characters are sampled without
        replacement, so none of them repeats.

    Raises:
        ValueError: If ``length`` is negative or larger than the 62 available
            characters (raised by ``random.sample``).
    """
    if length is None:
        length = number
    source = string.ascii_letters + string.digits
    return "".join(random.sample(source, length))
# Draw one interference line
def getLine(draw, width, height):
    """Draw one straight line between two random points.

    Args:
        draw: Drawing object whose ``line()`` method is called (getCode()
            passes an ``ImageDraw.Draw`` instance).
        width: Upper bound (inclusive) for the random x coordinates.
        height: Upper bound (inclusive) for the random y coordinates.
    """
    begin = random.randint(0, width), random.randint(0, height)
    end = random.randint(0, width), random.randint(0, height)
    draw.line([begin, end], fill=linecolor)
def getCode():
    """Render a captcha image, save it as ``idencode.png`` and return its text.

    The text comes from getNumber(), is drawn centred on a new image of
    ``size`` using the font at ``font_path``, optionally overlaid with
    ``line_number`` interference lines, and passed through an edge-enhance
    filter before being written to disk.

    Returns:
        The generated captcha string, so a caller can check user input
        against it. The original discarded it.
    """
    width, height = size
    image = Image.new("RGBA", size, bgcolor)
    font = ImageFont.truetype(font_path, 25)
    draw = ImageDraw.Draw(image)
    text = getNumber()
    # font.getsize() is not available in current Pillow releases; prefer
    # getbbox() when present and take its right/bottom extents as the text
    # size, falling back to getsize() on older installations.
    if hasattr(font, "getbbox"):
        _, _, font_width, font_height = font.getbbox(text)
    else:
        font_width, font_height = font.getsize(text)
    # Draw the text centred in the image.
    draw.text(
        ((width - font_width) / 2, (height - font_height) / 2),
        text,
        font=font,
        fill=fontcolor,
    )
    if draw_line:
        for _ in range(line_number):
            getLine(draw, width, height)
    # Optional distortion step, left disabled as in the original:
    # image = image.transform((width + 20, height + 10), Image.AFFINE, (1, -0.3, 0, -0.1, 1, 0), Image.BILINEAR)
    image = image.filter(ImageFilter.EDGE_ENHANCE_MORE)  # sharpen edges
    image.save('idencode.png')  # write the captcha picture to disk
    return text


if __name__ == '__main__':
    getCode()