简单爬虫示例：初学者还是用 bs4 + urllib 比较好
- #coding=utf-8
- import sys
- import time
- import urllib
- import urllib2
- import requests
- import numpy as np
- from bs4 import BeautifulSoup
# Python 2 only: reloading ``sys`` makes ``setdefaultencoding`` reachable again
# (the interpreter's start-up code removes it), so implicit str <-> unicode
# conversions performed later in this script use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding("utf8")

# Shared output file that the crawler writes to. Mode "w+" opens it for
# reading and writing and truncates any existing "foodlist" file.
out = open("foodlist", "w+")
def _node_text(node):
    """Return ``node.text``, or an empty string when ``find`` returned None."""
    return node.text if node is not None else ""


def pachong(tag, max_retries=3):
    """Crawl the meishichina.com search pages for ``tag`` and write rows to ``out``.

    Pages are fetched one after another (page 1, 2, 3, ...) until a page
    contains no ``<div class="detail">`` entries. For every entry one
    tab-separated line ``title<TAB>description<TAB>source`` is written to the
    module-level file object ``out``.

    Args:
        tag: Search keyword; it is percent-encoded with ``urllib.quote``
            before being placed in the URL path.
        max_retries: How many times a failing request for the same page is
            retried before the crawl gives up. The original code retried
            forever, which never terminated on a permanent error.

    Returns:
        None. Results are delivered through writes to ``out``.

    Note:
        ``str(title)`` on non-ASCII text relies on the module-level
        ``sys.setdefaultencoding('utf8')`` call made elsewhere in this file.
    """
    # Invariant part of every page URL, built once instead of on every pass.
    base_url = "http://home.meishichina.com/search/" + urllib.quote(tag) + "/"
    count = 1
    failures = 0  # consecutive failed requests for the current page
    while True:
        # Page 1 is the bare search URL; later pages append "page/<n>/".
        url = base_url if count == 1 else base_url + "page/" + str(count) + "/"
        # Random pause in [0, 2) seconds before each request
        # (presumably to avoid hammering the site).
        time.sleep(np.random.rand() * 2)
        print(url)
        try:
            source_code = urllib2.urlopen(urllib2.Request(url)).read()
        except (urllib2.HTTPError, urllib2.URLError) as e:
            print(e)
            failures += 1
            if failures > max_retries:
                # Give up instead of looping forever on a permanent error;
                # rows written for earlier pages are kept.
                break
            continue
        failures = 0

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(str(source_code), "html.parser")
        list_soup = soup.find_all('div', {'class': 'detail'})
        if not list_soup:
            # An empty result page marks the end of the listing.
            break

        for item in list_soup:
            # ``find`` returns None for a missing element; treat it as empty
            # text rather than crashing on ``None.text``.
            title = _node_text(item.find('h4'))
            descrip = _node_text(item.find('p', {'class': 'subcontent'}))
            # Strip first so a whitespace-only source also gets the placeholder.
            myfrom = _node_text(item.find('div', {'class': 'left'})).strip() or "无"
            out.write(str(title) + '\t' + str(descrip) + '\t' + myfrom + '\n')

        # Persist each finished page so an interrupted crawl keeps its results.
        out.flush()
        count += 1
# Script entry: crawl the search results for the hard-coded tag "火锅"
# (hot pot). The shared output file is closed afterwards even if the crawl
# raises, so buffered rows are flushed to disk and the handle is not leaked.
try:
    pachong("火锅")
finally:
    out.close()