一、爬虫简述
示例
# Collect the chapter URLs linked from the table of contents of the
# study_v2 index page.
#
# An earlier variant of this example (left commented out in the notes)
# skipped the scraping and generated the URLs directly:
#     for i in range(1, 27):
#         urls.append('http://www.apelearn.com/study_v2/chapter{0}.html'.format(i))
import re

import requests

# Captures the <ul> body that follows the table-of-contents heading.
reg = re.compile(r"<h3>目录列表</h3>\s+<ul>\s+([\s\S]*?</ul>)")
# Captures the value of every href="..." attribute.
regurl = re.compile(r'''href="(.*?)"''')

url = "http://www.apelearn.com/study_v2/"
session = requests.session()
# A timeout keeps the script from hanging forever on an unresponsive server.
r = session.get(url, timeout=10)
# Decode the response body as UTF-8 when reading r.text.
r.encoding = "utf-8"
html = r.text

htmlli = reg.findall(html)

# Always defined, so later code can rely on it even when nothing matched.
urls = list()
# findall() returns an empty list when the heading is missing; check that
# before indexing to avoid an IndexError.
if htmlli and htmlli[0]:
    result = regurl.findall(htmlli[0])
    for i in result:
        chapter_url = "http://www.apelearn.com/study_v2/{0}".format(i)
        print(chapter_url)
        # Append the chapter URL itself (the original appended the list
        # to itself, producing a self-referencing list).
        urls.append(chapter_url)
二、Python处理pdf文件
2.1.安装PyPDF2
pip3 install PyPDF2
2.2.示例
# Merge the numerically named PDF files in ./test (1.pdf, 2.pdf, ...) into a
# single file, test/testlinux.pdf, in ascending numeric order.
#
# NOTE(review): PdfFileReader / PdfFileWriter are the class names of older
# PyPDF2 releases; newer releases renamed them (PdfReader / PdfWriter) --
# confirm against the installed version.
import contextlib
import os

import PyPDF2

outputName = "testlinux.pdf"

# Skip the merged output itself: it is written into the same directory, so on
# a second run it would otherwise be listed and break the numeric sort below.
files = [
    fileName
    for fileName in os.listdir('test')
    if fileName.endswith(".pdf") and fileName != outputName
]

# Sort by the integer in front of ".pdf" so that 10.pdf comes after 9.pdf.
# A non-numeric file name still raises ValueError here, on purpose.
newFile = sorted(files, key=lambda d: int(d.split('.pdf')[0]))
print(newFile)

# From here on, file names are resolved relative to the 'test' directory.
os.chdir('test')

# Start from an empty PDF and append every page of every input file.
pdfWriter = PyPDF2.PdfFileWriter()
with contextlib.ExitStack() as stack:
    for item in newFile:
        # Keep each input open until the output has been written; presumably
        # the writer still reads page data from the source streams at write
        # time. The ExitStack closes all of them afterwards.
        source = stack.enter_context(open(item, 'rb'))
        pdfReader = PyPDF2.PdfFileReader(source)
        for page in range(pdfReader.numPages):
            pdfWriter.addPage(pdfReader.getPage(page))

    # The built-in open() is the right tool for binary output (the original
    # called codecs.open without importing codecs).
    with open(outputName, "wb") as f:
        pdfWriter.write(f)
三、处理图片
3.1.安装pillow模块
pip install pillow
3.2.示例1
from PIL import Image

# Open an image, print its format, size and mode, then display it.
# The context manager closes the underlying file when the block ends.
with Image.open("test.jpg") as image:
    print(image.format, image.size, image.mode)
    image.show()

# Commonly used Image methods:
#   Image.open(file)                         open an image file
#   image.show()                             display the image
#   image.save(outputfile)                   write the image to a file
#   image.crop((left, upper, right, lower))  cut out a rectangular region
#
# Geometric operations (im is an Image instance):
#   out = im.resize((128, 128))                # change the image size
#   out = im.rotate(45)                        # rotate 45 degrees counter-clockwise
#   out = im.transpose(Image.FLIP_LEFT_RIGHT)  # swap left and right
#   out = im.transpose(Image.FLIP_TOP_BOTTOM)  # swap top and bottom
#   out = im.transpose(Image.ROTATE_90)        # rotate 90 degrees
#   out = im.transpose(Image.ROTATE_180)       # rotate 180 degrees
#   out = im.transpose(Image.ROTATE_270)       # rotate 270 degrees
3.3.示例2
from PIL import Image

# Cut a rectangular region out of test.jpg and display it.
# The context manager closes the source file once the block ends.
with Image.open("test.jpg") as image:
    # crop() takes one 4-tuple: (left, upper, right, lower).
    cutjpg = image.crop((320, 65, 460, 220))
    cutjpg.show()