# -*- coding: utf-8 -*-
# Generate <site>.verify.xml verification files from awrapper extractor output.
import os
import sys
def regex_content(content):
    # Return the prefix of `content` up to (but not including) the first
    # regex metacharacter, so the result can be embedded literally inside
    # a ".*...*" pattern without escaping.
    # sample input: content = "小燕<哈否\\住宅+在哪里|---*"
    regexs = ['\\', '<', '>', '^', '$', '*', '+', '?',
              '{', '}', '.', '|', '[', ']', '!', '(', ')']
    con = ""
    for ch in content:
        if ch in regexs:
            break
        con += ch
    print len(con)
    return con
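# Illustrative behaviour of regex_content (hypothetical calls):
#   regex_content("abcdef")                  -> "abcdef"  (no metacharacters)
#   regex_content("小燕<哈否\\住宅+在哪里")   -> "小燕"    (cut at the first '<')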
# read site names and urls from url.txt
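# Assumed layout of url.txt (inferred from the parsing below):
#   <site_name>\t\t<url_1>\t<url_2>\t...
# Lines whose first field starts with '#' are treated as comments.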
filename = sys.argv[1]
url_file = open(filename, 'r')
lines = url_file.readlines()
file_tmp = []
file_tmpp = []
file_tmo = []
url_flag = False
for line in lines:
    a_line = line.split('\t\t')
    url_name = a_line[0].strip(' \n')
    # skip blank lines and comment lines
    if not url_name or url_name[0] == '#':
        continue
    print url_name
    # create the verify xml for this site
    new_file = open('%s/struc_file/%s.verify.xml' % (os.getcwd(), url_name), 'a')
    urls = a_line[1].split('\t')
    # write the xml header and open the root element
    new_file.write('<?xml version="1.0" encoding="utf-8"?>' + '\n')
    new_file.write('<verify>' + '\n')
    for url in urls:
        # run the awrapper extractor against the live page
        file_tmp = os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy' % url)
        # collect the output; if it is too short, the url was not crawled
        count_file = 0
        file_tmo = []
        for tt in file_tmp:
            count_file = count_file + 1
            file_tmo.append(tt)
        print count_file
        if count_file <= 2:
            # fallback: fetch the page with wget, then re-run the
            # extractor against the downloaded copy
            print 'cannot crawl the url %s' % url
            os.popen('wget --user-agent="gxy" "%s" -O %s/tmp/%s' % (url, os.getcwd(), url_name))
            file_tmpp = os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy -d %s/tmp/%s' % (url, os.getcwd(), url_name))
            # read the pipe into a list once; iterating the pipe object here
            # would exhaust it before the parsing loop below
            file_tmo = file_tmpp.readlines()
            for ll in file_tmo:
                print ll
            print count_file
        # otherwise file_tmo already holds the extractor output collected
        # in the counting loop above
        # parse the extractor output: each line looks like "field: value"
        for rr in file_tmo:
            fields = rr.split(':')
            name = fields[0].strip()
            if name == 'url':
                url_flag = True
                new_file.write('\t' + '<url name="' + url.strip('\n') + '">' + '\n')
            elif name in ('author', 'title', 'category', 'source', 'status', 'chapter_name'):
                new_file.write('\t\t' + '<field name="' + name + '" value="' + fields[1].strip() + '" verify_type="Equal" />' + '\n')
            elif name == 'list_url':
                # the value may itself contain ':' (e.g. "http://"),
                # so rejoin everything after the first field
                del fields[0]
                listurl = ":".join(fields).strip(' \n')
                print "list_url is %s\n" % listurl
                new_file.write('\t\t' + '<field name="list_url" value="' + listurl + '" verify_type="Equal" />' + '\n')
            elif name == 'content':
                del fields[0]
                # rejoin on ':' and collapse runs of whitespace to single spaces
                content = " ".join(":".join(fields).split())
                con = regex_content(content)
                new_file.write('\t\t' + '<field name="content" value=".*' + con.strip('\n') + '.*" verify_type="RegexMatch" />' + '\n')
            else:
                continue
        # if the extractor never emitted a 'url' line, record the raw url;
        # otherwise close the <url> element
        if url_flag == False:
            new_file.write('\t' + url.strip('\n') + '\n')
        else:
            new_file.write('\t' + '</url>' + '\n')
            url_flag = False
    new_file.write('</verify>' + '\n')
    new_file.close()
    print 'create %s.verify.xml successfully\n' % url_name
url_file.close()
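# Usage (script name is illustrative):
#   python gen_verify.py url.txt
#
# Shape of the generated struc_file/<site>.verify.xml, per the writes above:
#   <?xml version="1.0" encoding="utf-8"?>
#   <verify>
#       <url name="http://...">
#           <field name="title" value="..." verify_type="Equal" />
#           <field name="content" value=".*....*" verify_type="RegexMatch" />
#       </url>
#   </verify>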