词语相似度计算：3、使用urllib2爬取Wikipedia文章，使用BeautifulSoup解析HTML - CSDN博客

本文链接：https://blog.csdn.net/mmc2015/article/details/50943043

详细介绍参考：

http://blog.csdn.net/mmc2015/article/details/50923309

完整代码如下，供大家参考。

[python]view plain copy 
   
 
 #!/usr/bin/env python
 # -*-coding:utf-8 -*-  
   
   
   
 import pandas as pd  
 import numpy as np  
   
 import urllib, urllib2  
 import re  
 from bs4 import BeautifulSoup  
   
   
 import sys  
 # Python 2 only: sys.setdefaultencoding is removed from the sys module at
 # interpreter start-up, so the module is reloaded to get it back. The
 # implicit str <-> unicode codec is then switched to UTF-8 so that calling
 # str() on non-ASCII unicode text (as done when saving page text) does not
 # raise UnicodeEncodeError. This is a process-wide setting.
 reload(sys)  
 sys.setdefaultencoding("utf8")  
   
   
   
 def SaveFile(content, filename):
     """Append ``content`` as one line to the file ``wikiData/<filename>``.

     Args:
         content: Any object; it is converted with ``str()`` before writing
             and a newline is appended.
         filename: Name of the target file inside the ``wikiData`` directory.

     The ``wikiData`` directory is created if it does not exist yet, and the
     file handle is released even when the write raises.
     """
     import os  # local import: the module header does not import os

     directory = "wikiData"
     # Opening in append mode fails with IOError when the directory is
     # missing, so make sure it exists first.
     if not os.path.isdir(directory):
         os.makedirs(directory)
     # The context manager closes the file even if write() raises.
     with open(os.path.join(directory, filename), "a") as f:
         f.write(str(content) + "\n")
   
   
 def SpideWiki(words):
     """Download the English Wikipedia page of each word and save its first paragraphs.

     For every entry in ``words`` the page
     ``https://en.wikipedia.org/wiki/<word>`` is requested, up to three ``<p>``
     elements that are direct children of the ``mw-content-text`` div are
     extracted, and the text of each is appended to ``wikiData/<word>`` via
     SaveFile.

     Args:
         words: Sequence of article titles. Each title is concatenated to the
             URL verbatim; no URL escaping is applied.

     A ``urllib2.URLError`` raised for one word is reported by printing its
     ``code`` and/or ``reason`` (whichever the exception carries); the
     remaining words are still processed.
     """
     user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
     headers = {'User-Agent': user_agent}
     for word in words:
         # The try block is per word so that one failed request does not
         # abandon every word that follows it.
         try:
             request = urllib2.Request("https://en.wikipedia.org/wiki/" + word,
                                       headers=headers)
             response = urllib2.urlopen(request)
             try:
                 wikiHtml = response.read().decode('utf-8')
             finally:
                 # Release the connection whether or not reading succeeded.
                 response.close()
         except urllib2.URLError as e:
             if hasattr(e, "code"):
                 print(e.code)
             if hasattr(e, "reason"):
                 print(e.reason)
             continue

         # Pass the decoded text straight to the parser; wrapping it in str()
         # would re-encode it through the interpreter's default codec.
         html = BeautifulSoup(wikiHtml, "lxml")
         div = html.find(name='div', id='mw-content-text')
         if div is None:
             # find() returns None when the page has no such div.
             print("%s has no mw-content-text div, skipped" % word)
             continue
         # recursive=False: only direct children of the content div.
         for p in div.find_all(name='p', limit=3, recursive=False):
             SaveFile(p.get_text(), word)
         # Single-argument print gives the same output as the original
         # comma-separated print statement.
         print("%s process over... %s" % (word, "==" * 20))