多线程的应用
前程无忧岗位爬虫
import requests
from re import search
from multiprocessing import Process, Queue
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
import json, csv
def get_html(name, page, queue):
    """Fetch one 51job search-result page and push its embedded JSON onto the queue.

    name  -- search keyword (job title), interpolated into the search URL
    page  -- 1-based result-page number
    queue -- multiprocessing.Queue that receives the raw JSON string

    On a missing result marker (anti-crawler page / layout change) the page
    is skipped instead of crashing the worker thread.
    """
    print(f'====={name}: {page}页数据开始获取======')
    url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,{name},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    headers = {
        'cookie': '_uab_collina=164732602034885325275861; guid=bc15f226f677117c7fd664ae75d88456; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; privacy=1648085824; acw_tc=781bad3516480858282898541e362354335159dc5f48379d417d901e48e869; acw_sc__v2=623bcb911548c2c555b7cb99fe6e256d2b444e9b; search=jobarea%7E%60000000%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CA%FD%BE%DD%B7%D6%CE%F6%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60000000%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FAjava%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21; ssxmod_itna=GqIh0KGKDI1GkDzpD2eEfhqWqYvnTuRcIePDsWkrDSxGKidDqxBWeC+FeGQdMktzC4YhdMnCbu+WMzngntaioC8WrD=xYQDwxYoDUxGtDpxG6orDTUQDWDYPGWDDbHDRxY=Du2KDaHvq0rcpPD0gR1kpvC5irqvAIOnZcFDB4NoDa4rQRrrjr3ei2DsjD48804fli4eC3L5i0KiBnPAOxPDDawyzhDD=; ssxmod_itna2=GqIh0KGKDI1GkDzpD2eEfhqWqYvnTuRc9qikIDRCjD0yxcq03EWeWRWSN56D67iIN1gzlwR4hoRcgBxhW9mw0Xpqp+EPtsq3/b7yj2GIbKhe6YYjQpIYP+1TdLBRFzYuFbTKZB88PBvN7Oez/nrtZ7B72nbbqEI7irAziPONuQesqlchoIKCzOcTb4mHI+H3KlTIdcYmm183y=MvqhhOoA9=ojr3aP8PR=e1UFBKkEd=V4WIEQT3=guuix=vqhYHIrPPE7lbNQ0DZSAQoaTgqxTtAOqW8iqP6e6pBm+CW7PxY9YUD1UdbRgWaSYprUT7QbHK4kShV7nc+4nCc4QGC7hQ32vbxDuxvRHt87pz4xvIiYp57PWQBNRF3S4SiPrn2NzRnbY0IKkCuSUP=SDx/BQb+kF4sWnZYamfBx8fFBTmQRQEWWA2rVRkADv3wL8bO+e8BwHe7KsDaQW6UxeFiNS4t2pFuu/mxKX2l7vakxe+5+B7Glh/7qu8u42iO+95bwWI9gmT6ne=B94RWip25auGIRljoViqqIeMnR4qwlzTIkWLRuVe4+9iD=yWWnFWThUubmZoprh94Fmv9D+LR+Nt0UF+y/xciI60cdH9NC6QSGY5W9YDkRy5Vliz0QSIoD07kGPYAoeYooAqcx4NrlQ6qD7=DY94eD==',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }
    # timeout so a stalled connection cannot hang the pool thread forever
    response = requests.get(url, headers=headers, timeout=30)
    # the result list is embedded in the page as a JS assignment; dots are
    # escaped so the marker is matched literally
    match = search(r'window\.__SEARCH_RESULT__ = (.+?)</script>', response.text)
    if match is None:
        # anti-crawler response or page-layout change: skip this page
        # instead of crashing the worker thread with AttributeError
        return
    print(f'====={name}: {page}页数据获取完成======')
    queue.put(match.group(1))
def get_100_page_data(name, queue):
    """Crawl the first 100 result pages for *name* with a 100-thread pool.

    Runs inside a worker process; each page is fetched by get_html on its
    own pool thread and the raw JSON string lands on *queue*.
    """
    # NOTE(review): the original iterated range(1, 1001) (1000 pages), which
    # contradicts both the function name and the 100-thread pool size; fixed
    # to 100 pages — confirm the intended page count.
    # The with-block waits for all submitted fetches (same as pool.shutdown()).
    with ThreadPoolExecutor(100) as pool:
        for page in range(1, 101):
            pool.submit(get_html, name, page, queue)
def save_data():
    """Consumer loop for the CSV-writer thread.

    Pulls raw JSON strings off the module-level queue ``q`` and appends the
    interesting fields of every job to the module-level csv ``writer``,
    until the 'end' sentinel arrives.
    """
    while True:
        raw = q.get()
        if raw == 'end':
            break
        rows = [
            [job['job_name'],
             job['providesalary_text'],
             job['workarea_text'],
             job['company_name']]
            for job in json.loads(raw)['engine_jds']
        ]
        writer.writerows(rows)
if __name__ == '__main__':
    q = Queue()
    # keep a handle on the CSV file so it can be closed once the writer
    # thread has drained the queue (the original leaked it, risking lost
    # buffered rows at interpreter exit)
    csv_file = open('files/岗位信息.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(csv_file)
    writer.writerow(['岗位', '薪资', '工作地点', '公司'])

    # consumer thread: writes queued results to the CSV file
    t = Thread(target=save_data)
    t.start()

    # one producer process per search keyword
    p1 = Process(target=get_100_page_data, args=('数据分析', q))
    p2 = Process(target=get_100_page_data, args=('java', q))
    p1.start()
    p2.start()
    p1.join()
    p2.join()

    # signal end-of-data, then wait for the consumer before closing the file
    q.put('end')
    t.join()
    csv_file.close()
pdf文件读操作
from PyPDF2 import PdfFileReader

# Read a PDF: open inside a context manager so the file handle is closed
# when done (the original leaked it).
with open('files/存储引擎的讲解.pdf', 'rb') as f:
    reader = PdfFileReader(f)
    num = reader.getNumPages()
    print(num)
    for index in range(num):
        # getPage returns a PageObject; the original discarded it, so the
        # loop only demonstrates per-page access
        page = reader.getPage(index)
pdf文件写操作
from PyPDF2 import PdfFileWriter, PdfFileReader

# Copy pages of one PDF into a new file, with proper handle cleanup
# (the original leaked both the input and the output handles).
with open('files/存储引擎的讲解.pdf', 'rb') as src:
    reader = PdfFileReader(src)
    writer = PdfFileWriter()
    nums = reader.getNumPages()
    # NOTE(review): starting at 1 drops page 0 — presumably intentional
    # (strip the first page); confirm before changing.
    for x in range(1, nums):
        writer.addPage(reader.getPage(x))
    # write while the source is still open: PyPDF2 reads page content
    # lazily from the source stream
    with open('files/new.pdf', 'wb') as dst:
        writer.write(dst)
合并pdf文件
from PyPDF2 import PdfFileReader, PdfFileWriter

# Concatenate two PDFs into one, with proper handle cleanup
# (the original leaked all three file handles).
with open('files/千锋Python人工智能+数据分析课程大纲2021版.pdf', 'rb') as f1, \
        open('files/存储引擎的讲解.pdf', 'rb') as f2:
    reader1 = PdfFileReader(f1)
    reader2 = PdfFileReader(f2)
    writer = PdfFileWriter()
    # copy every page of both documents, in order
    for reader in (reader1, reader2):
        for index in range(reader.getNumPages()):
            writer.addPage(reader.getPage(index))
    # write while the sources are still open (PyPDF2 reads pages lazily)
    with open('files/合并.pdf', 'wb') as out:
        writer.write(out)
页面相关操作
from PyPDF2 import PdfFileWriter, PdfFileReader

# Stamp the first page of a watermark PDF onto every page of a content PDF,
# with proper handle cleanup (the original leaked all three file handles).
with open('files/存储引擎的讲解.pdf', 'rb') as content_f, \
        open('files/学习路线图水印.pdf', 'rb') as mark_f:
    reader1 = PdfFileReader(content_f)
    reader2 = PdfFileReader(mark_f)
    writer = PdfFileWriter()
    # overlay page: first page of the watermark document
    walter = reader2.getPage(0)
    for x in range(reader1.getNumPages()):
        page = reader1.getPage(x)
        # mergePage draws the overlay onto the content page in place
        page.mergePage(walter)
        writer.addPage(page)
    # write while the sources are still open (PyPDF2 reads pages lazily)
    with open('files/页面操作.pdf', 'wb') as out:
        writer.write(out)
创建水印
from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase.pdfmetrics import registerFont
from reportlab.pdfbase.ttfonts import TTFont
# Build a one-page watermark PDF with two semi-transparent text stamps and
# an image.  NOTE: the canvas calls below are order-dependent — font, fill
# color and rotation are canvas state that applies to later draw calls.
pdf = Canvas('files/water.pdf')
# register two TrueType fonts under the aliases F1 / F2
registerFont(TTFont('F1', 'files/cc.ttf'))
registerFont(TTFont('F2', 'files/dd.ttf'))
pdf.setFont('F2', 30)
# gray fill; 4th argument is the alpha (60% opaque)
pdf.setFillColorRGB(0.4, 0.4, 0.4, 0.6)
pdf.drawString(450, 300, '你好吗?')
pdf.setFont('F1', 40)
# red fill at 50% alpha
pdf.setFillColorRGB(1, 0, 0, 0.5)
# rotate the coordinate system 45°; affects the draw calls below
pdf.rotate(45)
pdf.drawString(300, 100, '你好吗?')
# drawn after rotate(), so the image is placed in rotated coordinates too
pdf.drawImage('files/apple.png', 300, 120)
pdf.save()
批量添加水印
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
def add_water(path, watermark=None):
    """Stamp a watermark page onto every page of the PDF at *path*.

    path      -- source PDF file path
    watermark -- PageObject to overlay on each page; defaults to the
                 module-level ``water`` page prepared in ``__main__``
                 (backward-compatible with the original implicit global)

    The result is written to ``out/<original file name>``; both file
    handles are closed (the original leaked them).
    """
    if watermark is None:
        watermark = water
    writer = PdfFileWriter()
    with open(path, 'rb') as src:
        reader = PdfFileReader(src)
        for x in range(reader.getNumPages()):
            page = reader.getPage(x)
            # mergePage draws the watermark onto the page in place
            page.mergePage(watermark)
            writer.addPage(page)
        file_name = os.path.basename(path)
        # write before the source handle closes: PyPDF2 reads pages lazily
        with open(f'out/{file_name}', 'wb') as dst:
            writer.write(dst)
def add_all_water():
    """Watermark every PDF sitting in the ./in directory.

    Delegates the per-file work to add_water; output goes to ./out.
    """
    folder = './in'
    for entry in os.listdir(folder):
        add_water(os.path.join(folder, entry))
if __name__ == '__main__':
    # Watermark overlay: first page of the watermark PDF.
    # NOTE(review): the file handle is deliberately never closed here —
    # PyPDF2 reads page content lazily, so closing it before
    # add_all_water() runs would break the merge. Confirm if refactoring.
    water = PdfFileReader(open('files/学习路线图水印.pdf', 'rb')).getPage(0)
    add_all_water()