代码实例:
# -*- coding: utf-8 -*-
#Author:qiang
import re
import os
import csv
class HTML_Process(object):
    """Extract meter-reading records from an HTML page and save them as CSV.

    Constructing an instance runs the whole pipeline: the page text is parsed
    with regular expressions, the complete records are written to
    ``output_file.txt`` (relative to the current directory) and progress
    messages are printed to stdout.

    Attributes set by the constructor:
        page: the page text that was passed in.
        record_dict: list with one dict per complete record, keyed by the
            names in ``_FIELD_NAMES``.
        record_arr: list with one 15-item list per complete record, holding
            the same values in column order.
    """

    # Name of the CSV file written by save_data().
    _OUTPUT_FILE = "output_file.txt"

    # Column titles of the CSV file, in output order.
    _CSV_HEADER = ["用户编号", "用户类型", "计量点名称", "资产编号", "出厂编号",
                   "示数类型", "上次示数", "本次示数", "综合倍率",
                   "上次抄见电量", "本次电量", "抄表状态", "抄表异常分类",
                   "抄表数据来源", "用电地址"]

    # Dictionary keys of one record, in the same order as the CSV columns.
    _FIELD_NAMES = (
        "user_number",         # user number
        "user_type",           # user type
        "meter_point_name",    # metering point name
        "asset_number",        # asset number
        "factory_number",      # factory (serial) number
        "display_type",        # reading type
        "last_display",        # previous reading
        "current_display",     # current reading
        "comprehensive_rate",  # combined multiplier
        "last_power",          # energy read last time
        "current_power",       # energy this time
        "status",              # meter-reading status
        "abnormal_assort",     # meter-reading anomaly category
        "data_from",           # meter-reading data source
        "user_address",        # electricity usage address
    )

    # One record: everything after a "queryConsInfo" link up to the next "<a".
    # "." does not match a newline here, so a record must sit on a single line.
    _RECORD_PATT = re.compile(r"green..href=.javascript:queryConsInfo(.*?)<a")
    # Text of a table cell whose content does not start with "<".
    _CELL_PATT = re.compile(r"<td noWrap>([^<].*?)</td>")
    # Link text of the record, which is the user number.
    _USER_NUMBER_PATT = re.compile(r"\).>(.*?)</a>")

    def __init__(self, page):
        """Parse *page*, write the CSV file and print "finish".

        Args:
            page: the HTML page text to process.

        Raises:
            TypeError: if *page* is not a string (for example ``None``).
        """
        self.page = page
        self.record_dict, self.record_arr = self.deal_page()
        self.save_data()
        print("\nfinish")

    def save_data(self):
        """Write the header row and every row of ``self.record_arr`` as CSV."""
        if bytes is str:
            # Python 2: the csv module writes byte strings to a binary file.
            output_file_object = open(self._OUTPUT_FILE, "wb")
        else:
            # Python 3: the csv module writes text and needs newline="" so it
            # controls the line endings itself; UTF-8 matches the bytes that
            # Python 2 emits for the header of this UTF-8 encoded source file.
            output_file_object = open(self._OUTPUT_FILE, "w", newline="",
                                      encoding="utf-8")
        # "with" closes the file even when a row fails to serialise.
        with output_file_object:
            writer = csv.writer(output_file_object)
            writer.writerow(self._CSV_HEADER)
            writer.writerows(self.record_arr)

    def deal_page(self):
        """Split ``self.page`` into records and return the complete ones.

        Every candidate record is echoed to stdout. A record is kept only
        when the user number plus its table cells yield exactly 15 values;
        kept records are echoed again as a "-"-terminated list of values,
        followed by a final count line.

        Returns:
            A ``(record_dict, record_arr)`` tuple: a list of dicts keyed by
            ``_FIELD_NAMES`` and a list of the corresponding value lists.
        """
        record_dict = []
        record_arr = []
        for record in self._RECORD_PATT.findall(self.page):
            print(record)
            # user number first, then the table cells in page order
            result_arr = (self._USER_NUMBER_PATT.findall(record)
                          + self._CELL_PATT.findall(record))
            # only complete records (one value per column) are kept
            if len(result_arr) != len(self._FIELD_NAMES):
                continue
            record_dict.append(dict(zip(self._FIELD_NAMES, result_arr)))
            record_arr.append(result_arr)
            # echo the record; every value is followed by "-"
            print("".join(str(value) + "-" for value in result_arr))
        count = len(record_arr)
        print("total records :" + str(count))
        if count == 0:
            print("maybe will modify regular expression or check the page is ok")
        return record_dict, record_arr
def get_Page():
    """Read ``monInfo.txt`` from the current directory and return its text.

    Returns:
        The file contents as a string, or ``None`` when the file cannot be
        opened or read; a notice is printed to stdout in that case.
    """
    try:
        # "with" closes the handle even when read() fails part-way.
        with open("monInfo.txt") as file_object:
            return file_object.read()
    except (IOError, OSError):
        # Only I/O failures mean "file missing"; other errors are real bugs
        # and are left to propagate instead of being mislabelled.
        print("no [monInfo.txt] file in current folder")
        return None
# Script entry point: read the page, process it, then keep the console open.
if __name__ == "__main__":
    try:
        page = get_Page()
        # get_Page() returns None after reporting that the file is unreadable;
        # there is nothing to process in that case.
        if page is not None:
            HTML_Process(page)
    except Exception as exc:
        # Best effort: report the failure instead of hiding it, so the user
        # can tell why no output was produced.
        print("error: %s: %s" % (type(exc).__name__, exc))
    finally:
        # "pause" is the Windows console command that waits for a key press,
        # keeping the window open when the script is started by double-click.
        os.system("pause")