For a detailed introduction, see:
http://blog.csdn.net/mmc2015/article/details/50923309

The complete code is given below for reference.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import urllib, urllib2
import re
from bs4 import BeautifulSoup

import sys
reload(sys)
sys.setdefaultencoding("utf8")  # avoid UnicodeEncodeError when writing non-ASCII text (Python 2 only)


def SaveFile(content, filename):
    # append one paragraph of text to wikiData/<filename>; the wikiData/ directory must already exist
    f = open("wikiData/" + filename, "a")
    f.write(str(content) + "\n")
    f.close()


def SpideWiki(words):
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    try:
        for i in range(len(words)):
            url = "https://en.wikipedia.org/wiki/" + words[i]
            request = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(request)
            wikiHtml = response.read().decode('utf-8')
            html = BeautifulSoup(str(wikiHtml), "lxml")
            div = html.find(name='div', id='mw-content-text')
            ps = div.find_all(name='p', limit=3, recursive=False)  # only direct children of the content div
            for p in ps:
                pText = p.get_text()
                SaveFile(pText, words[i])
            print words[i], "process over...", "==" * 20
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
```