Python Web Data Scraping (xpath Edition)

http://www.redicecn.com/html/blog/

Compared with the earlier "regular expression edition", this version brings the following improvements:

(1) Fetched HTML pages are cached in SQLite, which makes reprocessing the data much faster. The first run takes about 6 hours; after that the whole job finishes in roughly 3 minutes.
(2) xpath replaces the previous regular expressions for HTML parsing. xpath locates elements far more simply and conveniently, and it automatically repairs invalid HTML syntax (see the short sketch right after this list). xpath is really powerful!
(3) Duplicate results are removed.
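
As a quick illustration of point (2): the snippet below is a minimal, self-contained sketch (the broken markup is invented for the demo and is not part of the program further down). lxml's etree.HTML() builds a usable tree even from malformed HTML, and xpath() then locates the nodes without any hand-written regular expressions.

# coding:utf-8
# minimal sketch: lxml repairs broken HTML, xpath then locates the nodes
from lxml import etree

broken_html = "<table class='n_table'><tr><td>Toronto<td>Canada</table>"
tree = etree.HTML(broken_html)   # unclosed <tr>/<td> tags are fixed automatically
for cell in tree.xpath("//table[@class='n_table']//td"):
    print cell.text              # prints "Toronto", then "Canada"

A regular expression would have to anticipate the missing closing tags itself, which is exactly the fragility this version gets rid of.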


The program code is as follows:

# coding:utf-8
# Practice of scraping web data with xpath
# by redice 2010.11.05

import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import urllib2
from urllib2 import URLError, HTTPError
import zlib
import sqlite3

try:
    import cPickle as pickle
except ImportError:
    import pickle


conn = sqlite3.connect("html_cache.db")
conn.text_factory = lambda x: unicode(x, 'utf-8', 'replace')
curs = conn.cursor()

# if the htmls table does not exist, create it
#curs.execute('''CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE,content BLOB,size INTEGER);''')
curs.execute('''CREATE TABLE if not exists htmls(url VARCHAR(255) UNIQUE,content TEXT,size INTEGER);''')
conn.commit()

def serialize(value):
    """convert object to a compressed pickled string to save in the db
    """
    #return sqlite3.Binary(zlib.compress(pickle.dumps(value, protocol=pickle.HIGHEST_PROTOCOL), 5))
    #return sqlite3.Binary(value)
    return value

def deserialize(value):
    """convert compressed pickled string from database back into an object
    """
    #return pickle.loads(zlib.decompress(value)) if value else value
    return value

# Fetch the target html
def gethtml(url):
    '''Fetch the target html'''
    try:
        # look up the html_cache.db first
        curs.execute("select * from htmls where url=?;", (url,))
        row = curs.fetchone()
        if row:
            # found the target in the cache
            #print deserialize(str(row[1]))
            return deserialize(str(row[1]))

        response = urllib2.urlopen(url)
        result = response.read()
        # insert into the html_cache.db
        curs.execute("insert into htmls values(?,?,?);", (url, serialize(result), len(result)))
        conn.commit()

        print "saved %s into html_cache.db" % (url)

        return result
    except URLError, e:
        if hasattr(e, 'reason'):
            print 'Failed to reach a server.'
            print 'Reason: ', e.reason
            return 'None'
        elif hasattr(e, 'code'):
            print "The server couldn't fulfill the request."
            print 'Error code: ', e.code
            return 'None'
    #except:
        #return 'None'

# end def gethtml


import re

# Fetch all the strings matched. Return a list.
def regexmatch(rule, str):
    '''Fetch all the strings matched. Return a list.'''
    p = re.compile(rule)
    return p.findall(str)
# end def regexmatch


# decodeHtmlEntity
def decodeHtmlEntity(s):
    '''decodeHtmlEntity'''
    if s == '' or not s:
        return ''
    result = s

    import locale
    # re-encode in the local encoding and turn non-breaking spaces into plain spaces
    result = result.decode(locale.getdefaultlocale()[1], "ignore").encode(locale.getdefaultlocale()[1]).replace("\xc2\xa0", " ")

    return result
# end def decodeHtmlEntity

# final result
dining_db = []

total = 0

# debug flag
debug = 0

# Fetch menupalace.com's html
print 'Fetching html from http://menupalace.com ...'
html = gethtml('http://menupalace.com')

from lxml import etree

if html == '' or html == 'None':
    print "Can't get the html from http://menupalace.com"
    sys.exit()

try:
    tree = etree.HTML(html)
    nodes = tree.xpath("//table[@class='n_table']")
except:
    f = open("log.txt", "a")
    f.write(html)
    print("error resolving the html http://menupalace.com")
    sys.exit()

for node in nodes:
    if debug and total>=10:
        break

    n = node.xpath("./tr[1]/td[1]/img")
    # Fetch country
    country = ""
    if len(n)>0:
        country = decodeHtmlEntity(n[0].tail)
        country = country.strip()

    # Fetch all links
    ls = node.xpath(".//a")

    # Walk through all the links
    for l in ls:
        if debug and total>=10:
            break

        # city
        city = decodeHtmlEntity(l.text)
        city = city.strip()

        prelink = l.get("href")
        link = prelink + "restaurants/restaurants.aspx"

        #print 'Fetching html from '+ link +' ...'
        html = gethtml(link)
        if html=='' or html == 'None':
            print "Can't get the html from " + link
            continue

        try:
            subtree = etree.HTML(html)
            subnodes = subtree.xpath("//td[@class='frame_style_padding']")
        except:
            if debug:
                f = open("log.txt", "a")
                f.write(html)
                print("error resolving the html " + link)
                sys.exit()
            else:
                continue

        for sn in subnodes:
            if debug and total>=10:
                break

            sls = sn.xpath(".//a")
            for sl in sls:
                if debug and total>=10:
                    break

                link = prelink + "restaurants/" + sl.get("href")

                print 'Fetching html from '+ link +' ...'
                html = gethtml(link)
                if html=='' or html == 'None':
                    print "Can't get the html from " + link
                    continue

                try:
                    sstree = etree.HTML(html)
                    ssnodes = sstree.xpath("//table[@width='94%'][@height='80px']")
                except:
                    if debug:
                        f = open("log.txt", "a")
                        f.write(html)
                        f.write(" ")
                        print("error resolving the html " + link)
                        sys.exit()
                    else:
                        continue

                for ssn in ssnodes:
                    if debug and total>=10:
                        break

                    # name
                    n = ssn.xpath(".//tr[1]/td[1]/a[1]")
                    name = ''

                    if len(n)>0:
                        name = decodeHtmlEntity(n[0].text)
                        name = name.strip()
                        #print name

                    # address
                    n = ssn.xpath(".//tr[2]/td[1]")

                    # address array
                    address_arr = []
                    address = ''
                    state = ''

                    if len(n)>0:
                        address = decodeHtmlEntity(n[0].text)
                        # the restaurant has many locations

                        if address.strip()=='Various Locations':
                            n = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]")
                            if len(n)>0:

                                address = decodeHtmlEntity(n[0].text)
                                addrlist = address.split()
                                if len(addrlist)>4:
                                    state = addrlist[-2]
                                    city = addrlist[-3]
                                    # remove state and city from the address
                                    address = address.replace(state,'')
                                    address = address.replace(city,'')
                                    address = address.replace(addrlist[-1],'')
                                    address = address.strip()
                                    address_arr.append((address,city,state))


                                    brn = ssn.xpath(".//tr[1]/td[1]/div[1]/span[1]/br")
                                    for n in brn:
                                        address = decodeHtmlEntity(n.tail)
                                        addrlist = address.split()
                                        if len(addrlist)>4:
                                            state = addrlist[-2]
                                            city = addrlist[-3]
                                            # remove state and city from the address
                                            address = address.replace(state,'')
                                            address = address.replace(city,'')
                                            address = address.replace(addrlist[-1],'')
                                            address = address.strip()
                                            address_arr.append((address,city,state))

                            else:
                                address_arr.append(('','',''))
                        else:
                            addrlist = address.split()
                            if len(addrlist)>3:
                                state = addrlist[-1]
                                city = addrlist[-2]
                                # remove state and city from the address
                                address = address.replace(state,'')
                                address = address.replace(city,'')
                                address = address.strip()
                                address_arr.append((address,city,state))

                    # website
                    website = ''
                    n = ssn.xpath(".//tr[3]/td[1]/a[1]")
                    if len(n)>0:
                        website = decodeHtmlEntity(n[0].text)
                        website = website.strip()


                    if name and len(address)>0:
                        for addr in address_arr:
                            dining = {}
                            dining['name'] = name
                            if addr[0] == 'Various Locations':
                                dining['address'] = ''
                            else:
                                dining['address'] = addr[0]
                            dining['city'] = addr[1]
                            dining['state'] = addr[2]
                            dining['country'] = country
                            dining['website'] = website

                            # Avoid duplicates
                            if not (dining in dining_db):
                                dining_db.append(dining)
                                total = total + 1

                            if debug and total>=10:
                                break


# Close the database connection
conn.close()

# print and save the final result
import csv
cf = open("scraping_result.csv", "w")
writer = csv.writer(cf)
writer.writerow(['name','address','city','state','country','website'])

for item in dining_db:
    #print item['name'],item['address'],item['city'],item['state'],item['country'],item['website']
    rlist = []
    rlist.append(item['name'])
    rlist.append(item['address'])
    rlist.append(item['city'])
    rlist.append(item['state'])
    rlist.append(item['country'])
    rlist.append(item['website'])
    writer.writerow(rlist)

cf.close()


print 'The result has been saved into scraping_result.csv!'




Partial results:


Source code download:

File: Click to Download

 

