爬姓名大全网站的姓名-CSDN博客

 
    
         #coding=utf-8 
        
 
         import  
         urllib2 
        
 
         import  
         re 
        
 
         from  
         bs4  
         import  
         BeautifulSoup 
        
 
         import  
         sys 
        
 
         # Python 2 idiom: reload(sys) so that sys.setdefaultencoding is reachable
         # again, then make implicit str/unicode conversions use UTF-8.
         # NOTE(review): the name-writing loop further down appears to rely on this
         # when it writes scraped text to a plain file -- confirm before removing.
         reload(sys)
         sys.setdefaultencoding('utf-8')
        
 
         def getHtml(url):
             """Fetch a page and return its raw body.

             Args:
                 url: Anything ``urllib2.urlopen`` accepts -- a URL string or a
                     ``urllib2.Request`` (the script passes a ``Request`` so it
                     can send a custom User-Agent header).

             Returns:
                 The response body exactly as returned by ``read()``.

             Errors raised by ``urllib2.urlopen`` or ``read()`` propagate to the
             caller unchanged.
             """
             page = urllib2.urlopen(url)
             try:
                 return page.read()
             finally:
                 # Close explicitly so the connection is released even when
                 # read() raises; try/finally avoids needing a context manager.
                 page.close()
        
 
         # Page that lists the given names for one surname.
         url = "http://www.yw11.com/html/mi/3-85-0-1.htm"
         user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/600.7.12 (KHTML, like Gecko) Version/8.0.7 Safari/600.7.12'
         # Send a browser-like User-Agent with the request.
         headers = {"User-Agent": user_agent}
         request = urllib2.Request(url, headers=headers)
         html = getHtml(request)

         soup = BeautifulSoup(html, 'html.parser')
         # The names live in <div class="listbox1_text"> -> first <ul> -> <li> items.
         # The [0] lookups raise IndexError if the page layout no longer matches.
         divs = soup.find_all('div', attrs={"class": "listbox1_text"})[0]
         ul = divs.find_all('ul')[0]
         lis = ul.find_all('li')

         # Append so that repeated runs (e.g. for other surname pages) accumulate
         # into the same file; `with` closes the handle even if a write fails.
         with open('name1.txt', 'a') as f:
             for li in lis:
                 # Only leading whitespace is stripped from each entry.
                 # NOTE(review): li.text is presumably unicode under Python 2, so
                 # this write depends on the UTF-8 default encoding configured at
                 # the top of the script -- confirm.
                 name = li.text.lstrip()
                 f.write(name)
                 f.write('\r\n')

         # Report len(lis) rather than a loop index: the count is then exact and
         # still defined when no <li> element was found.
         print("抓取了" + str(len(lis)) + "个名字")

         # Re-read the file to report the running total across all runs.
         with open('name1.txt', 'r') as f:
             lines = f.readlines()
         print("当前一共有" + str(len(lines)))
        
 
  

上面的程序抓取的是起名网站

         http://www.yw11.com/namelist.php

上的名字。点开每个姓，可以查看要抓取的数据的格式；分析一下数据的格式，就可以很好地用 BS（BeautifulSoup）匹配出来了。

 
         <div class="listbox1_text">
         <ul>
         <li>刘佳乐</li>
         <li>刘慧娴</li>
         <li>刘嘉源</li>
         <li>刘建成</li>
         <li>刘艾佳</li>
         。。。。。。。。。。。。。。。。。。。
         刘威铭</li>
         <li>刘焕军</li>
         <li>刘舒锦</li>
         <li>刘瑾炎</li>
         <li>刘瑾昭</li>
         </ul>
         <div class="clear">&nbsp;</div>
         </div>