Python学习_30 数据处理（二）

燕-孑

已于 2024-03-14 16:47:49 修改

阅读量196

点赞数

分类专栏：猿课笔记_python 文章标签： python 学习开发语言

于 2018-06-11 23:04:25 首次发布

本文链接：https://blog.csdn.net/u011200965/article/details/80659240

版权

猿课笔记_python 专栏收录该内容

33 篇文章 0 订阅

订阅专栏

Python学习_30 数据处理（二）

1、抓取aming的linux教程，然后制作成pdf文件

a、先抓取每个网页

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 22:02

# @Author : xiexiaolong

# @File : demon.py

import codecs

import os

import sys

import pdfkit

import requests

# Download chapters 1..26 of the tutorial and render each one to "<n>.pdf"
# inside ./aming.

# NOTE(review): this value looks like the link's display text rather than a
# URL (it has no scheme or host). Replace it with the real base URL of the
# tutorial before running; it must end with "/" because "chapterN.html" is
# appended to it directly.
base_url = ' Linux入门教程_linux电子书 - 跟阿铭学linux(第2版) '

# Work inside ./aming so that every generated PDF ends up in one directory.
os.makedirs("aming", exist_ok=True)
os.chdir("aming")

# The wkhtmltopdf location does not change between chapters, so build the
# configuration once and actually hand it to pdfkit (the original created it
# on every iteration but never used it).
config = pdfkit.configuration(wkhtmltopdf=r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe")

for i in range(1, 27):
    url = base_url + 'chapter' + str(i) + '.html'
    print(url)
    pdf_name = str(i) + '.pdf'
    print(pdf_name)
    try:
        pdfkit.from_url(url, pdf_name, configuration=config)
    except Exception as exc:
        # Best effort: one chapter failing must not stop the remaining
        # downloads, but say which one was skipped and why. Catching
        # Exception (not a bare except) keeps Ctrl-C working.
        print("skipped {0}: {1}".format(url, exc))

b、合并到新的pdf文件

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 21:45

# @Author : xiexiaolong

# @File : demon2.py

import codecs

import PyPDF2

import os

# Merge the per-chapter PDFs in ./aming ("1.pdf", "2.pdf", ...) into a single
# "aminglinux.pdf", ordered by chapter number.

# Accept only names whose stem is purely numeric. The merged output is
# written into the same directory, so without this filter a second run would
# pick up "aminglinux.pdf" and crash in int().
suffix = ".pdf"
files = [
    fileName
    for fileName in os.listdir("aming")
    if fileName.endswith(suffix) and fileName[:-len(suffix)].isdigit()
]
# Sort numerically so that "10.pdf" comes after "9.pdf", not after "1.pdf".
newFiles = sorted(files, key=lambda d: int(d[:-len(suffix)]))
print(newFiles)

os.chdir("aming")

# NOTE(review): PdfFileWriter/PdfFileReader/numPages/getPage/addPage are the
# legacy PyPDF2 names used by this file; newer PyPDF2/pypdf releases renamed
# them - confirm against the installed version.
pdfWriter = PyPDF2.PdfFileWriter()  # start from an empty PDF

# The source files are kept open until the merged file has been written
# (presumably PyPDF2 reads page data from them on demand - confirm), and are
# then closed explicitly instead of being leaked.
handles = []
try:
    for item in newFiles:
        handle = open(item, "rb")
        handles.append(handle)
        pdfReader = PyPDF2.PdfFileReader(handle)
        for page in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(page))

    with open("aminglinux.pdf", "wb") as f:
        pdfWriter.write(f)
finally:
    for handle in handles:
        handle.close()

也可以通过正则匹配

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 21:40

# @Author : xiexiaolong

# @File : demon1.py

import re

import requests

# Fetch the table-of-contents page and build the list of chapter URLs from
# the links found inside the "目录列表" section.

# Captures everything from the first list item up to and including the
# closing </ul> of the list that follows the "目录列表" heading.
reg = re.compile(r"<h3>目录列表</h3>\s+<ul>\s+([\s\S]*?</ul>)")

# NOTE(review): this value looks like the link's display text rather than a
# URL (it has no scheme or host); replace it with the real address of the
# table-of-contents page before running.
url = " Linux入门教程_linux电子书 - 跟阿铭学linux(第2版) "

session = requests.session()
r = session.get(url)
# Decode the body as UTF-8 regardless of the encoding requests detected.
r.encoding = "utf-8"
html = r.text

htmlli = reg.findall(html)
regurl = re.compile(r'''href="(.*?)"''')

# findall() returns an empty list when the section is missing; indexing it
# unconditionally would raise IndexError.
result = regurl.findall(htmlli[0]) if htmlli else []

urls = list()
for i in result:
    # NOTE(review): same placeholder text as above - the prefix must be the
    # real base URL that the relative href is joined to.
    chapter_url = " Linux入门教程_linux电子书 - 跟阿铭学linux(第2版) {0}".format(i)
    print(chapter_url)
    # Append the URL itself (the original appended the list to itself).
    urls.append(chapter_url)

图片处理

1、Image

Python 的图片处理通常借助第三方库 PIL（Python Imaging Library）完成；在 Python 3 下应使用它的兼容分支 Pillow 模块

直接用pip install pillow安装

导入时使用 from PIL import Image

打开一张图片

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 22:23

# @Author : xiexiaolong

# @File : demon3.py

from PIL import Image

# Load the sample picture and report its basic properties.
picture = Image.open("12.jpg")
file_format, dimensions, colour_mode = picture.format, picture.size, picture.mode
print(file_format, dimensions, colour_mode)

# Open the picture in an image viewer.
picture.show()

2、处理图片的方法

#image的方法

#image.show()

#Image.open(file)

#image.save(outputfile)

#image.crop((left, upper, right, lower))#抠图，参数是一个四元组

# Image的几何处理：

# out = im.resize((128, 128)) #调整图片大小

# out = im.rotate(45) #逆时针旋转 45 度角。

# out = im.transpose(Image.FLIP_LEFT_RIGHT) #左右对换。

# out = im.transpose(Image.FLIP_TOP_BOTTOM) #上下对换。

# out = im.transpose(Image.ROTATE_90) #旋转 90 度角。

# out = im.transpose(Image.ROTATE_180) #旋转 180 度角。

# out = im.transpose(Image.ROTATE_270) #旋转 270 度角。

3、抠图

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 22:23

# @Author : xiexiaolong

# @File : demon3.py

from PIL import Image

# Cut a rectangular region out of the sample picture and store it as a new file.
source = Image.open("12.jpg")
print(source.format, source.size, source.mode)

# The crop box is given as (left, upper, right, lower).
left, upper, right, lower = 330, 90, 452, 232
cutout = source.crop((left, upper, right, lower))
cutout.save("cutting.jpg")

分析：抠图，并保存新图片到cutting.jpg中

4、图片合并

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 22:23

# @Author : xiexiaolong

# @File : demon3.py

from PIL import Image

# Cut a region out of the picture, turn it upside down and paste it back
# into the same place, then display the result.
picture = Image.open("12.jpg")
print(picture.format, picture.size, picture.mode)

box = (330, 90, 452, 232)
rotated_patch = picture.crop(box).transpose(Image.ROTATE_180)
picture.paste(rotated_patch, box)
picture.show()

5、缩放

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 22:23

# @Author : xiexiaolong

# @File : demon3.py

from PIL import Image

# Scale the picture to a fixed width of 300 px, keeping the aspect ratio,
# write the result to `outfile` and display it.
infile = "12.jpg"
outfile = "new12.jpg"

image = Image.open(infile)
(x, y) = image.size

newx = 300
# Height follows from the original aspect ratio; never let it collapse to 0
# for extremely wide pictures, since a zero-sized image cannot be created.
newy = max(1, int(y * newx / x))

# resize() takes the target size as one (width, height) tuple.
out = image.resize((newx, newy))
# The original defined `outfile` but never wrote the scaled image to it.
out.save(outfile)
out.show()

注意：image.resize((newx, newy)) 的参数必须是一个 (宽, 高) 元组，不能写成两个独立的参数

6、验证码

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# @Time : 2018\6\11 0011 22:23

# @Author : xiexiaolong

# @File : demon3.py

import random

import string

from PIL import Image, ImageFont, ImageDraw, ImageFilter

# ---- Captcha settings ------------------------------------------------------

# Text
font_path = "msyh.ttf"       # TrueType font file loaded for the captcha text
number = 4                   # how many characters the code consists of

# Canvas
size = (100, 30)             # image size as (width, height)

# Colours
bgcolor = (255, 255, 255)    # canvas background
fontcolor = (0, 0, 255)      # captcha text
linecolor = (255, 0, 0)      # interference lines

# Interference lines
draw_line = True             # whether any lines are drawn at all
line_number = 3              # how many lines are drawn when enabled

# Generate a random string
def getNumber(length=None):
    """Return a random string of distinct ASCII letters and digits.

    Args:
        length: Number of characters to produce. When omitted, the
            module-level ``number`` setting is used.

    Returns:
        A string of ``length`` characters. Characters are drawn without
        replacement, so no character occurs twice.

    Raises:
        ValueError: If ``length`` is negative or larger than the number of
            available characters (52 letters + 10 digits).
    """
    if length is None:
        length = number
    alphabet = string.ascii_letters + string.digits
    return "".join(random.sample(alphabet, length))

# Draw one interference line
def getLine(draw, width, height, color=None):
    """Draw a straight line between two random points.

    Args:
        draw: Drawing object exposing ``line(xy, fill=...)``; ``getCode``
            passes an ``ImageDraw.Draw`` instance.
        width: Largest x coordinate an end point may get (inclusive).
        height: Largest y coordinate an end point may get (inclusive).
        color: Fill colour of the line. When omitted, the module-level
            ``linecolor`` setting is used.
    """
    if color is None:
        color = linecolor
    begin = random.randint(0, width), random.randint(0, height)
    end = random.randint(0, width), random.randint(0, height)
    draw.line([begin, end], fill=color)

def _text_size(font, text):
    """Return the (width, height) that ``font.getsize(text)`` used to report."""
    if hasattr(font, "getbbox"):
        # font.getsize() was removed in Pillow 10; getbbox() is its
        # replacement. getsize() reported the vertical extent measured from
        # the drawing origin, which corresponds to `bottom`, so using it keeps
        # the text where it was placed before.
        left, _top, right, bottom = font.getbbox(text)
        return right - left, bottom
    # Older Pillow releases without getbbox().
    return font.getsize(text)


def getCode():
    """Render a random captcha and save it as ``idencode.png``.

    The image uses the module-level settings ``size``, ``bgcolor``,
    ``font_path``, ``fontcolor``, ``draw_line`` and ``line_number``.

    Returns:
        The text drawn into the image, so that the caller can compare it
        with what the user types.
    """
    width, height = size
    image = Image.new("RGBA", size, bgcolor)
    font = ImageFont.truetype(font_path, 25)
    draw = ImageDraw.Draw(image)

    text = getNumber()
    font_width, font_height = _text_size(font, text)
    # Centre the text on the canvas.
    draw.text(
        ((width - font_width) / 2, (height - font_height) / 2),
        text,
        font=font,
        fill=fontcolor,
    )

    if draw_line:
        for _ in range(line_number):
            getLine(draw, width, height)

    # Optional distortion, disabled in the original:
    # image = image.transform((width + 20, height + 10), Image.AFFINE, (1, -0.3, 0, -0.1, 1, 0), Image.BILINEAR)

    # Edge-enhancement filter.
    image = image.filter(ImageFilter.EDGE_ENHANCE_MORE)
    image.save('idencode.png')  # write the captcha image
    return text

if __name__ == '__main__':
    # Generate one captcha image when the file is run as a script.
    getCode()

燕-孑

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
Python学习_30 数据处理（二）

Python学习_30数据处理（二）1、抓取aming的linux教程，然后制作成pdf文件a、先抓取每个网页#!/usr/bin/env python# -*- coding: utf-8 -*-# @Time: 2018\6\11 0011 22:02# @Author: xiexiaolong# @File: demon.pyimport codecsimport ...
复制链接

扫一扫