# xie_xlsx — Excel read/write helper module (merged source)
# coding=gbk
import openpyxl
import re
class xie_xlsx():
    """Thin helper around openpyxl for reading and appending rows of an
    .xlsx workbook.

    Args:
        name:    path of the workbook file.
        biandan: worksheet title (default '表01').
    """

    def __init__(self, name, biandan='表01'):
        self.name01 = name      # workbook file path
        self.biaodan = biandan  # worksheet title

    def dubiao(self):
        """Read the whole worksheet and return it as a list of rows,
        each row being a list of cell values."""
        workbook = openpyxl.load_workbook(self.name01)  # one Workbook per Excel file
        sheet = workbook[self.biaodan]                  # select the worksheet
        # Row-major read of every cell value.
        return [[cell.value for cell in row] for row in sheet.rows]

    def chuangbiao(self, tou):
        """Create a fresh workbook containing a single worksheet whose
        first (header) row is *tou*, then save it to self.name01."""
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = self.biaodan  # name the worksheet
        sheet.append(tou)
        workbook.save(self.name01)
        print("表格创建成功。。。。。。。")

    def xiubiao02(self, data_list):
        """Append *data_list* to the worksheet as ONE row; every value is
        stringified first."""
        wb = openpyxl.load_workbook(self.name01)
        ws = wb[self.biaodan]
        ws.append([str(value) for value in data_list])
        wb.save(self.name01)
        print('数据已更新。。。。。。。。。。。。')

    def xiubiao03(self, data_list):
        """Append a batch: each element of *data_list* is itself an iterable
        and becomes one row (values stringified)."""
        wb = openpyxl.load_workbook(self.name01)
        ws = wb[self.biaodan]
        for row_data in data_list:
            ws.append([str(value) for value in row_data])
        wb.save(self.name01)
        print('数据集已更新。。。。。。。。。。。。')

    def xiubiao04(self, data_list):
        """Append each element of *data_list* as its own single-cell row
        (value stringified)."""
        wb = openpyxl.load_workbook(self.name01)
        ws = wb[self.biaodan]
        for value in data_list:
            ws.append([str(value)])
        wb.save(self.name01)
        print('数据已更新。。。。。。。。。。。。')
try:
    # Prefer the helper packaged inside 我的工具; fall back to a local module.
    from 我的工具 import xie_xlsx
except ImportError:  # was a bare except:, which also hid unrelated errors
    import xie_xlsx
from bs4 import BeautifulSoup
import time
import requests  # HTTP scraping
import re
import openpyxl
import multiprocessing

# Output workbook for scraped results; created up front with an empty header row.
yue01 = xie_xlsx.xie_xlsx('爬虫结果终601-1200.xlsx')
yue01.chuangbiao([])
def qing(url):
    """Fetch *url* and return the response body decoded as utf-8 text.

    Retries forever on any request error, waiting one second between
    attempts (the original called a bare ``sleep(1)``, which is an
    undefined name and crashed the retry path with NameError).
    """
    # Loop-invariant request headers, hoisted out of the retry loop.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0",
    }
    while True:
        try:
            # verify=False is deliberate here; silence urllib3's TLS warning.
            requests.packages.urllib3.disable_warnings()
            r = requests.get(url, headers=headers, verify=False, timeout=10)
            r.encoding = 'utf-8'
            return r.text
        except Exception as e:
            print('请求错误:', e)
            time.sleep(1)  # fix: was sleep(1) -> NameError
def run(in_q, out_q):
    """Worker: drain URLs from *in_q*, scrape each page's table cells and
    push one flattened list of cell texts per URL onto *out_q*.

    NOTE(review): checking ``empty()`` before ``get()`` is racy across
    processes — another worker may take the last item in between; kept
    as-is to preserve the original behavior.
    """
    while not in_q.empty():
        url = in_q.get()
        print(url)
        data01 = qing(url)
        soup = BeautifulSoup(data01, 'lxml')
        rows = soup.find_all('tr')
        # Strip every rendered <tr> from the raw HTML, then re-parse what
        # remains (presumably to drop nested/duplicated table markup —
        # str(tag) must match the raw text for the replace to take effect).
        for tr in rows:  # fix: original iterated undefined name `l01` -> NameError
            data01 = data01.replace(str(tr), '')
        soup = BeautifulSoup(data01, 'lxml')
        rows = soup.find_all('tr')
        nr_list = []
        for row in rows:
            cells = row.find_all(re.compile('th|td'))
            texts = [c.text.replace('\n', ' ').strip() for c in cells]
            # Keep only rows with more than one cell and a non-empty first cell.
            if len(texts) > 1 and texts[0].strip():
                nr_list = nr_list + texts
        out_q.put(nr_list)
        in_q.task_done()  # pairs with in_q.join() in the driver
    return out_q
def get_link(path='D:/学习/爬取程序/网址601-1200.xlsx', sheet='表01'):
    """Load and return the worksheet holding the URLs to scrape.

    Generalized: *path* and *sheet* are now parameters whose defaults are
    the original hard-coded values, so existing callers are unaffected.
    """
    data = openpyxl.load_workbook(path)
    print('读取成功')
    table = data[sheet]
    return table
# Multiprocess driver: read already-scraped URLs from a workbook, fan them
# out to a pool of worker processes, then append each result row to yue01.
if __name__ == '__main__':
    start = time.time()
    print(start)
    # Manager queues are proxy objects shareable across pool workers.
    queue = multiprocessing.Manager().Queue()
    result_queue = multiprocessing.Manager().Queue()
    print('构建成功')
    table = get_link()
    zhuan02=[]
    for row in table.rows:
        for cell in row:
            queue.put(cell.value)  # enqueue one URL per cell for the workers
            zhuan02.append(cell.value)
    print(zhuan02)
    print('queue 开始大小 %d' % queue.qsize())
    pool = multiprocessing.Pool(8)  # asynchronous (non-blocking) process pool
    # NOTE(review): one task is submitted per URL even though each run()
    # call already drains the queue until it is empty, so most of these
    # tasks will exit immediately — presumably harmless oversubmission.
    for index in range(len(zhuan02)):
        pool.apply_async(run, args=(queue, result_queue,))
        print('第{}个。。。。'.format(index))
    pool.close()
    print('close')
    pool.join()
    print('join')
    time.sleep(2)
    isempty = queue.empty()  # True once the URL queue has been drained
    print(isempty)
    queue.join()  # waits for one task_done() per queued item (called in run)
    end = time.time()
    print(end)
    # Workers put exactly one result list per URL, so drain len(zhuan02) items.
    for i in range(len(zhuan02)):
        line = result_queue.get()
        yue01.xiubiao02(line)  # append this page's cells as one workbook row
        print('第{}个'.format(i))
    print('总耗时:%s' % (end - start))
    print('queue 结束大小 %d' % queue.qsize())
    print('result_queue 结束大小 %d' % result_queue.qsize())