python爬虫
import datetime
import random
import time
import uuid
import calendar
import ddddocr
import psycopg2
import self
from bs4 import BeautifulSoup
from psycopg2 import extras
from selenium import webdriver
from selenium.webdriver.common.by import By
import constants
from chaojiying import Chaojiying_Client
# ---------------------------------------------------------------------------
# Emission-monitoring scraper.
#
# Logs in to the monitoring site with Selenium (solving the arithmetic captcha
# with ddddocr), opens the real-time data grid, and copies every non-empty
# reading into two PostgreSQL databases.  One browser session performs
# SCRAPES_PER_SESSION scrapes, SCRAPE_INTERVAL_SECONDS apart; after that (or
# after any error) the session is torn down and a new one is started.
# ---------------------------------------------------------------------------

# Site endpoints.
BASE_URL = 'http://112.53.78.78:8400/'
LOGIN_URL = BASE_URL + 'Login.aspx'

# The captcha screenshot is also written to disk so it can be inspected by hand.
CAPTCHA_IMAGE_PATH = 'codes.jpg'
CAPTCHA_IMAGE_XPATH = '/html/body/form/div/div[2]/div[2]/div[1]/table/tbody/tr[3]/td[3]/img'

# Rows of the grid pane holding device names and of the pane holding readings.
NAME_GRID_ROWS_XPATH = '//div[@class="l-grid-body l-grid-body1"]/div/table/tbody/tr'
DATA_GRID_ROWS_XPATH = '//div[@class="l-grid-body l-grid-body2 l-scroll"]/div/table/tbody/tr'

# The original script starts reading at grid row 33 and stops before the last
# two rows (range(33, row_count - 1)); the reason is not visible in this file.
FIRST_DATA_ROW = 33

# The original loop counted key = 1 .. 39, i.e. 39 scrapes per browser session.
SCRAPES_PER_SESSION = 39
SCRAPE_INTERVAL_SECONDS = 60

# Pause before starting a new session after a failure, so a persistent error
# (site down, database down) does not spawn browsers in a tight loop.
RETRY_DELAY_SECONDS = 5

DIGITS = frozenset('0123456789')

# Grid column index -> attribute key stored in the database.
COLUMN_ATTRIBUTE_KEYS = {
    2: "SO2_concentration_Zs",
    3: "SO2_concentration_Bz",
    4: "SO2_exceeding_Bs",
    5: "NOx_concentration_Zs",
    6: "NOx_concentration_Bs",
    7: "NOx_exceeding_Bs",
    8: "Smoke_concentration_Zs",
    9: "Smoke_concentration_Bz",
    10: "Smoke_exceeding_Bs",
    11: "O2",
    12: "FlowVelocity",
    13: "Humidness",
    14: "Pressure",
    15: "yanqiliuliang",
}

SELECT_DEVICE_SQL = """
SELECT * FROM up_tb_base.public.device where name=%s
"""
INSERT_PRODUCTION_SQL = """
INSERT INTO up_tb_data.public.tb_production(id,device_id,device_name,device_type,device_label,attribute_key,dbl_v,create_time,tenant_id) values(%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
DELETE_ATTRIBUTE_SQL = " DELETE FROM up_tb_base.public.attribute_kv WHERE entity_id=%s AND attribute_key=%s"
INSERT_ATTRIBUTE_SQL = "INSERT INTO up_tb_base.public.attribute_kv(entity_type, entity_id, attribute_type, attribute_key, bool_v, str_v, long_v, dbl_v, json_v, last_update_ts) VALUES ('DEVICE', %s, 'CLIENT_SCOPE', %s, 't', NULL, NULL, %s, NULL, %s)"


def pause():
    """Sleep for a random 1-3 seconds between browser actions."""
    time.sleep(random.randint(1, 3))


def solve_captcha(text: str) -> int:
    """Evaluate the arithmetic captcha from the first three OCR characters.

    The second character decides how the answer is computed:

    * ``'+'``        -> first + third
    * ``'-'``/``'r'`` -> first - third ('r' is presumably an OCR misreading of
      the minus sign; the original treated it as subtraction)
    * a digit        -> first - second (the operator was not recognised at all)
    * ``'o'``        -> treated as the digit zero, so the answer is the first
      digit.  The original evaluated ``int('o')`` here and always crashed.

    Raises:
        ValueError: if fewer than three characters were recognised, an operand
            is not a digit, or the second character is none of the above.
    """
    if len(text) < 3:
        raise ValueError(f"captcha OCR result too short: {text!r}")
    left, middle, right = text[0], text[1], text[2]
    if middle == '+':
        return int(left) + int(right)
    if middle in ('-', 'r'):
        return int(left) - int(right)
    if middle == 'o':
        return int(left)
    if middle in DIGITS:
        return int(left) - int(middle)
    raise ValueError(f"unrecognised captcha operator {middle!r} in {text!r}")


def attribute_key_for_column(column: int):
    """Return the attribute key for grid column *column*, or None if unmapped."""
    return COLUMN_ATTRIBUTE_KEYS.get(column)


def is_blank_reading(value: str) -> bool:
    """Return True for cell values that must not be stored ('0', '' or '/')."""
    return value in ('0', '', '/')


class Databases:
    """Connections and dict-row cursors for the data and base databases."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded as in the original script;
        # consider moving them next to USERNAME in the constants module.
        self.conn_data = psycopg2.connect(
            host="localhost",
            database="up_tb_data",
            user="postgres",
            password="123456"
        )
        try:
            self.conn_base = psycopg2.connect(
                host="localhost",
                database="up_tb_base",
                user="postgres",
                password="123456"
            )
        except Exception:
            # Do not leak the first connection when the second one fails.
            self.conn_data.close()
            raise
        self.tb_pro = self.conn_data.cursor(cursor_factory=extras.RealDictCursor)
        self.tb_div = self.conn_base.cursor(cursor_factory=extras.RealDictCursor)

    def rollback(self):
        """Abort the open transaction on both connections."""
        self.conn_data.rollback()
        self.conn_base.rollback()

    def close(self):
        """Close both cursors and both connections."""
        for resource in (self.tb_pro, self.tb_div, self.conn_data, self.conn_base):
            try:
                resource.close()
            except Exception as e:
                print(f'错误信息为:{e}')


def build_chrome_options():
    """Return the headless Chrome options used for every session."""
    option = webdriver.ChromeOptions()
    # NOTE(review): Chrome documents this switch as --window-size=W,H; the
    # value is kept exactly as in the original script - confirm it takes effect.
    option.add_argument('window-size=1920x3000')
    option.add_argument('--disable-gpu')
    option.add_argument('--hide-scrollbars')  # for pages that misbehave with scrollbars
    option.add_argument('--headless')  # run without a visible window
    return option


def log_in(driver):
    """Open the login page, fill in the credentials and captcha, and submit."""
    driver.get(url=LOGIN_URL)
    captcha_png = driver.find_element(By.XPATH, CAPTCHA_IMAGE_XPATH).screenshot_as_png
    with open(CAPTCHA_IMAGE_PATH, 'wb') as fp:
        fp.write(captcha_png)
    # The original comment said "maximize", but the call minimizes the window.
    driver.minimize_window()

    driver.find_element(By.XPATH, '//*[@id="txtUserName"]').send_keys(constants.USERNAME)
    pause()
    # NOTE(review): "PASSORED" looks like a typo of PASSWORD, but it must match
    # the name defined in the constants module, which is not visible here.
    driver.find_element(By.XPATH, '//*[@id="pwd"]').send_keys(constants.PASSORED)
    pause()

    captcha_text = ddddocr.DdddOcr().classification(captcha_png)
    answer = solve_captcha(captcha_text)
    driver.find_element(By.XPATH, '//*[@id="validcode"]').send_keys(str(answer))
    pause()
    driver.find_element(By.XPATH, '//*[@id="btnLogin"]').click()
    pause()


def open_realtime_menu(driver):
    """Click through to the real-time data view and return its iframe ``src``."""
    # First menu entry.  The original comment read "discarded"; presumably the
    # waste-gas menu is meant - confirm against the site.
    driver.find_element(By.XPATH, '//*[@id="cc5b660b-aaf0-4a90-83a1-70b2a8ac29e7"]').click()
    pause()
    # "Real-time data" entry.
    driver.find_element(By.XPATH, '//*[@id="a418b2b2-2060-4594-92f8-5872025907b0"]').click()
    pause()
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return soup.select('#iframetab')[0].attrs.get('src')


def find_device(db, device_name):
    """Return the first ``device`` row named *device_name*, or None."""
    db.tb_div.execute(SELECT_DEVICE_SQL, (device_name,))
    device = db.tb_div.fetchone()
    db.conn_base.commit()
    return device


def store_reading(db, device, attribute_key, value, row_time):
    """Insert one reading into tb_production and replace it in attribute_kv.

    A database error is reported and rolled back so that the connections stay
    usable for the next reading.
    """
    try:
        record_id = str(uuid.uuid4())
        now_time = row_time.strftime('%Y-%m-%d %H:%M:%S')
        utc_time = calendar.timegm(time.gmtime())
        db.tb_pro.execute(INSERT_PRODUCTION_SQL, (
            record_id, device["id"], device["name"], device["type"], device["label"],
            attribute_key, value, now_time, device["tenant_id"]))
        # Delete-then-insert keeps a single attribute row per device and key.
        db.tb_div.execute(DELETE_ATTRIBUTE_SQL, (device["id"], attribute_key))
        db.tb_div.execute(INSERT_ATTRIBUTE_SQL, (device["id"], attribute_key, value, utc_time))
        db.conn_data.commit()
        db.conn_base.commit()
    except Exception as e:
        # Without a rollback psycopg2 rejects every later statement on the
        # connection ("current transaction is aborted").
        db.rollback()
        print(f"插入数据库错误,错误信息是:{e}")


def scrape_row(driver, db, row_index, column_count):
    """Read one grid row and store each of its non-blank readings."""
    row_time = datetime.datetime.now()
    name_xpath = f'{NAME_GRID_ROWS_XPATH}[{row_index}]/td[4]'
    print(name_xpath)
    device_name = driver.find_element(By.XPATH, name_xpath).text
    print(device_name)
    try:
        device = find_device(db, device_name)
        if device is None:
            print(f"查询设备基本表,设备名不存在:{device_name}")
            return
        print(device)
        for column in range(2, column_count + 1):
            cell_xpath = f'{DATA_GRID_ROWS_XPATH}[{row_index}]/td[{column}]'
            value = driver.find_element(By.XPATH, cell_xpath).text.replace('--', '0').replace(' ', '0')
            if is_blank_reading(value):
                print('不添加')
                continue
            attribute_key = attribute_key_for_column(column)
            if attribute_key is None:
                # Columns beyond the mapped range have no attribute key;
                # storing them would write rows with a NULL key.
                continue
            store_reading(db, device, attribute_key, value, row_time)
    except Exception as e:
        # A failure in one row must not end the whole session.
        db.rollback()
        print(f"处理设备 {device_name} 的数据出错,错误信息是:{e}")


def scrape_once(driver, db, iframe_src):
    """Load the real-time grid and store the readings of every selected row."""
    print(iframe_src)
    driver.get(BASE_URL + iframe_src)
    # The original waited twice here before reading the grid; the same time
    # budget is kept in case the grid needs it to render.
    pause()
    pause()
    row_count = len(driver.find_elements(By.XPATH, DATA_GRID_ROWS_XPATH))
    column_count = len(driver.find_elements(By.XPATH, DATA_GRID_ROWS_XPATH + '[1]/td'))
    for row_index in range(FIRST_DATA_ROW, row_count - 1):
        if row_index == FIRST_DATA_ROW:
            # Scroll the first processed row into view before reading cells.
            target = driver.find_element(By.XPATH, f'{NAME_GRID_ROWS_XPATH}[{row_index}]')
            driver.execute_script("arguments[0].scrollIntoView();", target)
        scrape_row(driver, db, row_index, column_count)


def run_session():
    """Run one browser session: log in, then scrape SCRAPES_PER_SESSION times.

    The browser and the database connections are always released, including
    when an exception ends the session early.
    """
    driver = webdriver.Chrome(options=build_chrome_options())
    db = None
    try:
        # Hide navigator.webdriver so the site does not detect automation.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument",
                               {"source": """Object.defineProperty(navigator,'webdriver',{get:()=>undefined})"""})
        log_in(driver)
        iframe_src = open_realtime_menu(driver)
        db = Databases()
        for attempt in range(SCRAPES_PER_SESSION):
            if attempt:
                time.sleep(SCRAPE_INTERVAL_SECONDS)
            scrape_once(driver, db, iframe_src)
    finally:
        if db is not None:
            db.close()
        # quit() also ends the chromedriver process; close() only closes the window.
        driver.quit()


def main():
    """Run scraping sessions forever, reporting errors instead of exiting."""
    while True:
        try:
            run_session()
        except Exception as e:
            print(f'错误信息为:{e}')
            time.sleep(RETRY_DELAY_SECONDS)


if __name__ == "__main__":
    main()
代码规范
// Example of what to avoid: three levels of nested loops in one place.
List<String> orderCodeList = request.getOrderCodes();
// First-level loop: one iteration per order code.
for (String orderCode : orderCodeList) {
    // Look up the line items that belong to this order code.
    List<OrderItem> items = this.orderItemService.getByCode(orderCode);
    // Second-level loop: one iteration per line item.
    for (OrderItem item : items) {
        // perform operation 1
        // perform operation 2
        // perform operation 3
        // Third-level loop (header and body are elided placeholders in these notes).
        for()
    }
}
// Preferred form: each nesting level is moved into its own method.
List<String> orderCodeList = request.getOrderCodes();
// First-level loop: delegate the work for each order code.
for (String orderCode : orderCodeList) {
    this.processSingleOrder(orderCode);
}

/**
 * Loads the line items of one order and hands them on for processing.
 *
 * @param orderCode code of the order whose items are looked up
 */
public void processSingleOrder(String orderCode) {
    // Look up the line items that belong to this order code.
    List<OrderItem> items = this.orderItemService.getByCode(orderCode);
    this.processItemsData(items);
}

/**
 * Processes every line item of an order.
 *
 * @param items line items to process (the original declaration omitted this
 *              parameter although the caller passes it and the body uses it)
 */
public void processItemsData(List<OrderItem> items) {
    for (OrderItem item : items) {
        // perform operation 1
        // perform operation 2
        // perform operation 3
    }
}