Generating verify files

This Python 2 script reads a list of site names and URLs, runs ./awrapper_extract_tool on each URL (falling back to wget when a page cannot be crawled directly), and writes a <site_name>.verify.xml file under struc_file/ describing the fields expected from each page.
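As a quick illustration of the expected input (an assumption inferred from the parsing code below, which splits each line on a double tab and the URL list on single tabs; the file name url.txt and the values are made up), the URL list file could look like:

# lines whose site name starts with '#' are skipped
example_site		http://example.com/book/1	http://example.com/book/2

The script would then be run as (gen_verify.py is a hypothetical name for the code below):

python gen_verify.py url.txt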

#encoding:utf-8
import os
import sys


def regex_content(content):
  # Return the longest prefix of content that contains none of the special
  # characters below, so the prefix can be embedded literally in a
  # ".*<prefix>.*" RegexMatch pattern.
  ### example input the author tested with: content="小燕<哈否\\住宅+在哪里|---*"
  specials=['\\','<','>','^','$','*','+','?','{','}','.','|','[',']','!','(',')']
  prefix=""
  for ch in content:
      if ch in specials:
          # stop at the first special character; everything before it is literal
          break
      prefix+=ch
  print len(prefix)
  return prefix
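# Illustrative calls (assumed inputs, not from the original post), showing the
# prefix that ends up inside the ".*<prefix>.*" content pattern below:
#   regex_content("abc(def")     returns "abc"          (stops at the first '(')
#   regex_content("hello world") returns "hello world"  (no special characters)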


## read site names and URLs from the file passed on the command line
## each non-comment line has the form: <site_name>\t\t<url1>\t<url2>...
filename=sys.argv[1]
url_file=open(filename,'r')
lines=url_file.readlines()
url_flag=False
for line in lines:
      a_line=line.split('\t\t')
      url_name=a_line[0].strip(' \n')
      # skip blank lines and comment lines starting with '#'
      if not url_name or url_name[0] == '#':
            continue
      print url_name
      # create the verify xml for this site under struc_file/
      new_file=open('%s/struc_file/%s.verify.xml' % (os.getcwd(),url_name),'a')
      urls=a_line[1].split('\t')
      
      # write the xml declaration and open the <verify> root element
      new_file.write('<?xml version="1.0" encoding="utf-8"?>'+'\n')
      new_file.write('<verify>'+'\n')
      
      
      for url in urls:
         ## run awrapper_extract_tool to extract structured fields from the page
         file_tmp=os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy' % url)
         # read the tool output; very short output means the url could not be crawled
         file_tmo=file_tmp.readlines()
         count_file=len(file_tmo)
         print count_file
         ##if file_tmp[0].find('crawl')>=0 or count_file<=1:
         if count_file <= 2:
              # fall back to wget: fetch the page into tmp/ and re-run the
              # extractor on the local copy
              print 'cannot crawl the url %s' % url
              wrong_file=open('%s/tmp/%s' % (os.getcwd(),url_name),'a')
              os.popen('wget --user-agent="gxy" "%s" -O %s/tmp/%s' % (url,os.getcwd(),url_name))
              file_tmpp=os.popen('./awrapper_extract_tool -t awrapper_extractor/ -u "%s" -tidy -d %s/tmp/%s' % (url,os.getcwd(),url_name))
              # read the lines into a list first, otherwise printing them here
              # would exhaust the pipe before the fields are parsed below
              file_tmo=file_tmpp.readlines()
              for ll in file_tmo:
                   print ll
              wrong_file.close()
              print count_file
         else:
              # direct extraction succeeded; file_tmo already holds the tool output
              pass
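         # Assumed shape of the extractor output that the loop below parses,
         # one "name: value" pair per line (illustrative only, not taken from
         # the tool's documentation):
         #   url: http://example.com/book/1
         #   title: Some Title
         #   content: first words of the chapter text ...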
         for rr in file_tmo:
              fields=rr.split(':')
              field_name=fields[0].strip()
              if field_name=='url':
                 url_flag=True
                 new_file.write('\t'+'<url name="'+url.strip('\n')+'">'+'\n')
              elif field_name in ('author','title','category','source','status','chapter_name'):
                 # simple fields are verified with an exact Equal match
                 new_file.write('\t\t'+'<field name="'+field_name+'" value="'+fields[1].strip()+'" verify_type="Equal" />'+'\n')
              elif field_name=='list_url':
                 # the value itself may contain ':', so rejoin everything after the field name
                 listurl=":".join(fields[1:]).strip(' \n')
                 print "list_url is %s\n" % listurl
                 new_file.write('\t\t'+'<field name="'+field_name+'" value="'+listurl+'" verify_type="Equal" />'+'\n')
              elif field_name=='content':
                 # collapse whitespace, then keep only a regex-safe prefix of the content
                 content=" ".join(":".join(fields[1:]).split())
                 con=regex_content(content)
                 new_file.write('\t\t'+'<field name="content" value=".*'+con.strip('\n')+'.*" verify_type="RegexMatch" />'+'\n')
              else:
                 continue
         # if the tool never reported a url field, just record the raw url;
         # otherwise close the <url> element opened above
         if url_flag==False:
                new_file.write('\t'+url.strip('\n')+'\n')
         else:
                new_file.write('\t'+'</url>'+'\n')
                url_flag=False
      new_file.write('</verify>'+'\n')
      new_file.close()
      print 'created %s.verify.xml successfully\n' % url_name
url_file.close()
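For reference, a generated verify file has roughly the following shape. The field names and values here are invented placeholders; the tags, attributes and nesting follow the new_file.write calls above (the script indents with tab characters):

<?xml version="1.0" encoding="utf-8"?>
<verify>
	<url name="http://example.com/book/1">
		<field name="title" value="Some Title" verify_type="Equal" />
		<field name="author" value="Some Author" verify_type="Equal" />
		<field name="list_url" value="http://example.com/book/1/list" verify_type="Equal" />
		<field name="content" value=".*first words of the chapter.*" verify_type="RegexMatch" />
	</url>
</verify>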

