全国主要流域重点断面水质自动监测周报pdf使用python提取_重点流域自动监测断面水质数据-CSDN博客

全国主要流域重点断面水质自动监测周报主要包含分类、序号、采集数据日期、点位名称、周期、PH、DO、CODMn、NH3-N、本周水质、上周水质、主要污染指标、发布时间等字段。

周报数据源：

中国环境监测总站(此链接数据较全，包含2006年-2018年的水质自动监测周报数据) http://webinterface.cnemc.cn/csszzb2093030.jhtml
中国环境监测总站(目前找到的唯一可用数据源，包含2007年-2018年的水质自动监测周报数据) http://www.cnemc.cn/sssj/szzdjczb/index.shtml

上述数据源中的数据为PDF格式，在ELT入库过程中，数据抽取模块需要将PDF这样的非结构化数据抽取并转换成excel/csv，以方便后续处理。

PDF周报数据的抽取难点在于：PDF中的表格只是打印用的绘制指令，不像HTML那样包含起始标签、行、列等结构信息，处理起来较为麻烦。

测试了几款python库将PDF转为excel, 均有各种问题，比如pdf2docx，camelot，tabula，最后发现pdfplumber处理效果较好，可以较完整地处理合并的单元格，翻页，复杂表头等情况，附上主要处理代码：

#!/usr/bin/python
# -*- coding: UTF-8 -*-
 
from pymysql_lib import UsingMysql
import os,shutil,re,time,datetime
from os.path import exists
import pdfplumber
 
# Accepted spellings of the first header cell ("serial number" column).
_HEADER_FIRST_CELLS = ("序\n号", "序号")

# Accepted values for each of the 12 header cells, in column order.
# None marks cells the table extractor returned empty (presumably the
# continuation of a merged header cell -- confirm against sample PDFs).
_EXPECTED_HEADER = (
    _HEADER_FIRST_CELLS,
    ("水系", "流域"),
    ("河流名称",),
    ("点位名称", "断面（点位）名称"),
    ("断面情况", "断面（点位）情况"),
    ("评价因子（单位：mg/L）", "评价因子（单位：\nmg/L）"),
    (None,),
    (None,),
    (None,),
    ("水质类别", "水质别"),
    (None,),
    ("主要污染指标",),
)


def _collect_rows(pdf):
    """Return ``(summary, rows)`` extracted from an open pdfplumber PDF.

    ``summary`` is the text of the first page (``None`` when the PDF has no
    pages).  ``rows`` holds the first header row followed by the table rows
    that survive filtering: repeated header rows and rows whose first cell is
    ``None`` are dropped, and from the third page onwards only rows whose
    first cell is all digits are kept.
    """
    summary = None
    rows = []
    header_found = False

    for index, page in enumerate(pdf.pages):
        if index == 0:
            summary = page.extract_text()

        for table in page.extract_tables():
            for row in table:
                # An empty row has no first cell to classify; skip it instead
                # of raising IndexError.
                if not row or row[0] is None:
                    continue

                is_header = row[0] in _HEADER_FIRST_CELLS
                if is_header and header_found:
                    continue

                # The first two pages may carry non-numeric rows that are
                # kept; later pages only contribute numbered rows.
                if index > 1 and not row[0].isdigit():
                    continue

                if is_header:
                    header_found = True
                rows.append(row)

    return summary, rows


def _normalize_rows(rows):
    """Normalise ``rows`` in place to 12 columns; return True when usable.

    ``rows`` must be non-empty.  When the header has 11 columns with
    "点位名称" in the third position, a "河流名称" column is inserted into the
    header and an empty cell into every 11-column row.  Columns 1 and 2 are
    then forward-filled from the last non-empty value above.  The offending
    row is printed and False is returned as soon as a row does not have 12
    columns or a non-header row still contains ``None``.
    """
    without_river = False
    if len(rows[0]) == 11 and rows[0][2] == "点位名称":
        without_river = True
        rows[0].insert(2, "河流名称")

    last_col1 = ""
    last_col2 = ""

    for row_id, row in enumerate(rows):
        if without_river and len(row) == 11:
            row.insert(2, "")

        if len(row) != 12:
            print(row)
            return False

        if row[1] is not None:
            row[1] = row[1].replace('\n', '')

        # Forward-fill cells left empty by vertically merged cells.
        if row[1]:
            last_col1 = row[1]
        else:
            row[1] = last_col1

        if row[2]:
            last_col2 = row[2]
        else:
            row[2] = last_col2

        # The header row (row_id 0) legitimately contains None cells.
        if row_id != 0 and any(cell is None for cell in row):
            print(row)
            return False

    return True


def _is_expected_header(header):
    """Return True when ``header`` matches the known 12-column layout."""
    return len(header) == len(_EXPECTED_HEADER) and all(
        cell in allowed for cell, allowed in zip(header, _EXPECTED_HEADER)
    )


def check_it():
    """Load every unprocessed weekly-report PDF under ``incoming``.

    Walks ``<cwd>/incoming`` and, for each ``.pdf`` whose name does not yet
    exist under ``processed``, extracts the table rows, normalises them and
    checks the header layout.  Matching data is handed to ``loadData``; when
    that returns 1 the PDF is moved into ``processed``.  Files that yield no
    rows, invalid rows or an unknown header are reported on stdout and left
    in place.
    """
    with UsingMysql(log_time=True) as um:
        incoming_dir = os.path.join(os.getcwd(), "incoming")

        for root, _dirs, files in os.walk(incoming_dir):
            for file in files:
                processed_path = os.path.join("processed", file)
                if not file.endswith(".pdf") or exists(processed_path):
                    continue

                print('\nProcessing: ' + file)

                # os.walk descends into sub-directories, so build the path
                # from ``root`` rather than assuming a flat ``incoming``.
                source_path = os.path.join(root, file)

                # The context manager closes the PDF even if extraction
                # raises part-way through.
                with pdfplumber.open(source_path) as pdf:
                    summary, data = _collect_rows(pdf)

                # An empty extraction has nothing invalid in it, but it must
                # not reach the header check (data[0] would not exist).
                data_valid = 1 if (not data or _normalize_rows(data)) else 0

                loaded = False
                if data_valid == 1 and len(data) > 0 and _is_expected_header(data[0]):
                    loaded = loadData(summary, data, file, um) == 1
                else:
                    print("unexpceted: " + file + ", " + str(len(data)) + ", " + str(data_valid))
                    if len(data) > 0:
                        print(data[0])

                if loaded:
                    shutil.move(source_path, processed_path)
                    print('Processed: ' + file)
 
# Run the batch import only when executed as a script, not on import.
if "__main__" == __name__:
    check_it()