#coding=utf-8
import requests
from lxml import etree
from pymongo import MongoClient
# Field labels matched against the text of <span> elements on the page.
# standard_dict() addresses them by position (s[0] .. s[12]), so the order
# of the entries must not change.
s = [
    u'标准编号:',        # 0: standard number
    u'发布单位:',        # 1: issuing organisation
    u'发布日期:',        # 2: release date
    u'状态:',            # 3: status
    u'实施日期:',        # 4: implementation date
    u'开本页数:',        # 5: format / page count
    u'采用关系:',        # 6: adoption relationship
    u'中图分类号:',      # 7: Chinese Library Classification number
    u'中国标准分类号:',  # 8: Chinese Standard Classification number
    u'国际标准分类号:',  # 9: International Classification for Standards number
    u'国别:',            # 10: country
    u'关键词:',          # 11: keywords
    u'摘要:',            # 12: abstract
]


# Get the database handle.
def get_db():
    """Connect to MongoDB and return the authenticated ``wanfang`` database.

    The host, user name and password literals below appear to be
    placeholders and have to be replaced with real values before use.

    Returns:
        The ``wanfang`` database object of the newly created client.
    """
    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider reading them from configuration or the environment.
    client = MongoClient('IP', 27017)
    db = client.wanfang
    # NOTE(review): Database.authenticate() no longer exists in PyMongo 4;
    # with a newer driver the credentials go to MongoClient instead.
    # Confirm the driver version before changing this call.
    db.authenticate("用户名", "密码")
    return db


# Fetch the num-th record.
def get_data(table, num):
    """Return the ``content`` of the num-th document of *table* (1-based).

    Documents are read with ``table.find`` using a projection that keeps
    only the ``content`` field.  The first ``num - 1`` documents are skipped
    regardless of their content; from position ``num`` onwards the first
    document with a non-empty ``content`` is returned.

    Args:
        table: Collection-like object exposing ``find(filter, projection)``.
        num: 1-based position at which to start looking.

    Returns:
        The ``content`` value, or ``None`` when no such document exists
        (including when ``num`` is smaller than 1).
    """
    i = 1
    for item in table.find({}, {"content": 1, "_id": 0}):
        if i != num:
            i += 1
            continue
        # The counter deliberately stays at ``num`` here, so a document with
        # missing or empty content falls through to the next one.
        # ``dict.get`` replaces the Python-2-only ``has_key``.
        content = item.get('content')
        if content:
            return content
    return None
# Convert a list to a string.
def list_str(list):
    """Return the first element of *list*, or ``""`` when there is none.

    Args:
        list: Sequence of values (the callers in this file pass XPath
            results).  The name shadows the builtin but is kept so that
            keyword callers keep working.

    Returns:
        ``list[0]`` for a non-empty sequence, otherwise the empty string.
        ``None`` is treated like an empty sequence.
    """
    return list[0] if list else ""
# Extract classification codes.
def code_ls(list):
    """Split the first element of *list* into codes, dropping bracketed parts.

    The first element is split on whitespace and every token that contains
    a parenthesis is discarded.

    Args:
        list: Sequence of strings; only the first element is used.  The name
            shadows the builtin but is kept for keyword callers.

    Returns:
        A list with the remaining tokens, or ``""`` (not an empty list) when
        *list* is empty.  The mixed return type is kept for compatibility
        with data already written by this script.
    """
    if not list:
        return ""
    # NOTE(review): the original condition tested the same ASCII pair twice.
    # The second pair is presumed to have been the full-width parentheses
    # (U+FF08 / U+FF09), so both forms are filtered here -- confirm against
    # real page data.
    brackets = (u"(", u")", u"\uff08", u"\uff09")
    return [
        token
        for token in list[0].split()
        if not any(bracket in token for bracket in brackets)
    ]
# Build the keyword list.
def keywords_ls(list):
    """Return *list* unchanged when it has entries, otherwise ``""``.

    Args:
        list: Sequence of keywords.  The name shadows the builtin but is
            kept for keyword callers.

    Returns:
        The very same object that was passed in when it is non-empty; the
        empty string for an empty sequence or ``None``.
    """
    return list if list else ""
# Replaced standard.
def replace_str(replace):
    """Return the normalised first entry of *replace* without its prefix.

    The first entry is stripped, all spaces inside it are removed, and the
    first five characters are dropped (presumably a fixed label in front of
    the value -- confirm against the page markup).

    Args:
        replace: Sequence of strings; only the first entry is used.

    Returns:
        The remaining text, or ``""`` when *replace* is empty or ``None``.
    """
    # The original compared the argument with "" (always unequal for a
    # list) and normalised every entry although only the first one is used.
    if not replace:
        return ""
    first = replace[0].strip().replace(" ", "")
    return first[5:]
# Extract the abstract.
def summary_str(list):
    """Return the first entry of *list* unless it starts with ``<``.

    Args:
        list: Sequence of strings; only the first entry is inspected.  The
            name shadows the builtin but is kept for keyword callers.

    Returns:
        The first entry, or ``""`` when *list* is empty or the entry begins
        with ``<``.
    """
    if not list:
        return ""
    text = list[0]
    # startswith() also copes with an empty string, where indexing the
    # first character would raise IndexError.
    if text.startswith("<"):
        return ""
    return text
# Normalise the date format.
def date_str(list):
    """Convert a date such as ``2017年6月28日`` to ``2017-06-28``.

    Single-digit months and days are zero-padded.  A string without the
    year/month/day markers is returned unchanged.

    Args:
        list: Sequence of strings; only the first entry is used and the
            sequence itself is left untouched.  The name shadows the builtin
            but is kept for keyword callers.

    Returns:
        The reformatted date, or ``""`` when *list* is empty.
    """
    if not list:
        return ""
    # Work on a local copy: the original wrote the padded text back into
    # the caller's list.
    text = list[0]
    year = text.find(u'年')
    month = text.find(u'月')
    day = text.find(u'日')
    # A gap of two between markers means exactly one digit sits in between.
    # All positions were taken before padding, as in the original.
    if month - year == 2:
        text = text.replace(u"年", u"年0")
    if day - month == 2:
        text = text.replace(u"月", u"月0")
    return text.replace(u"日", u"").replace(u"月", u"-").replace(u"年", u"-")
# Normalise the adoption-relationship format.
def adopted_ls(string, ls):
    """Parse ``code,type`` pairs out of *string* and append them to *ls*.

    For every comma found, the text before it (stripped) becomes ``code``
    and the three characters after it become ``type``; parsing resumes right
    after those three characters.  Text after the last pair that contains no
    comma is ignored.

    Args:
        string: Text holding zero or more ``code,type`` pairs.
        ls: List that receives one ``{"code": ..., "type": ...}`` dict per
            pair; it is modified in place.

    Returns:
        The same *ls* object.
    """
    # A loop instead of one recursive call per pair, so long inputs cannot
    # run into the interpreter's recursion limit.
    while True:
        loc = string.find(',')
        if loc == -1:
            return ls
        ls.append({
            "code": string[:loc].strip(),
            "type": string[loc + 1:loc + 4],
        })
        string = string[loc + 4:]


# Build the dictionary that is stored for one standard.
def standard_dict(html):
    """Extract the fields of one standard from its HTML page.

    Most values are located relative to a ``<span>`` whose text equals one
    of the labels in the module-level list ``s``; the raw XPath results are
    then cleaned by the helper functions of this module.

    Args:
        html: HTML source of the page.

    Returns:
        A dict with the keys ``title``, ``title_eng``, ``standard_number``,
        ``publishing_department``, ``release_date``, ``state``,
        ``enforcement_date``, ``pages``, ``adopted``, ``clc``, ``ccs``,
        ``ics``, ``country``, ``keywords``, ``summary`` and ``replace_for``.
    """
    tree = etree.HTML(html)

    def labelled(index, tail="/following-sibling::*/text()"):
        # Text nodes found via *tail*, starting at the <span> whose text
        # is the label s[index].
        return tree.xpath("//span[text()='%s']%s" % (s[index], tail))

    # Variant for values that sit one element deeper than the sibling.
    child_text = "/following-sibling::*/child::*/text()"

    dc = {}
    # Standard title
    dc["title"] = list_str(tree.xpath("//h1/text()"))
    # Foreign-language title
    dc["title_eng"] = list_str(tree.xpath("//h2/text()"))
    # Standard number
    dc["standard_number"] = list_str(labelled(0))
    # Issuing organisation
    dc["publishing_department"] = list_str(labelled(1))
    # Release date
    dc["release_date"] = date_str(labelled(2))
    # Status
    dc["state"] = list_str(labelled(3))
    # Implementation date
    dc["enforcement_date"] = date_str(labelled(4))
    # Format / page count
    dc["pages"] = list_str(labelled(5))
    # Adoption relationship
    dc["adopted"] = adopted_ls(list_str(labelled(6)), [])
    # Chinese Library Classification number
    dc["clc"] = code_ls(labelled(7))
    # Chinese Standard Classification number
    dc["ccs"] = code_ls(labelled(8, child_text))
    # International Classification for Standards number
    dc["ics"] = code_ls(labelled(9))
    # Country
    dc["country"] = list_str(labelled(10))
    # Keywords
    dc["keywords"] = keywords_ls(labelled(11, child_text))
    # Abstract: the text follows the label's parent element.
    dc["summary"] = summary_str(
        labelled(12, "/parent::*/following-sibling::*/text()"))
    # Replaced standard
    dc["replace_for"] = replace_str(
        tree.xpath("//div[@id='replaceStandard']//child::*//text()"))
    return dc


# Main entry point.
def main():
    """Clean every document of ``standard`` into ``standard_cleaned``.

    Each document of the ``standard`` collection that has a non-empty
    ``content`` field is parsed with ``standard_dict`` and the resulting
    dict is inserted into the ``standard_cleaned`` collection.
    """
    db = get_db()
    source = db.standard
    cleaned = db.standard_cleaned
    for item in source.find({}, {"content": 1, "_id": 0}):
        # ``dict.get`` replaces the Python-2-only ``has_key``.
        content = item.get('content')
        if content:
            # NOTE(review): Collection.insert() is deprecated since
            # PyMongo 3.0 in favour of insert_one(); kept because the driver
            # version in use is not visible here.
            cleaned.insert(standard_dict(content))


if __name__ == '__main__':
    main()

# The code below was used to test cleaning one specific record.
#db = get_db()
#collection=db.standard
#collection2 = db.standard_cleaned
#data = get_data(collection, 8)
#dc = standard_dict(data)
#collection2.insert(dc)
#for k,v in dc.items():
#print k,v
## 以下代码用于测试提取摘要
#data = requests.get('http://d.wanfangdata.com.cn/Standard/ISO%208528-5-2013')
#dc = standard_dict(data.text)
#for k,v in dc.items():
#print k,v
## 以下代码用于测试修改日期格式
#l1 = [u"2017年6月28日"]
#l2 = [u"2017年10月27日"]
#l3 = [u"2017年12月1日"]
#l4 = [u"2017年7月1日"]
#print date_str(l1)
#print date_str(l2)
#print date_str(l3)
#print date_str(l4)