# Crawl Dianping (dianping.com) user reviews  [抓取大众点评的用户评论]
# -*- coding: utf-8 -*-
__author__ = 'Administrator'
import sys,urllib.request
sys.path.append('./')
import sql
#import re,time
import time
from bs4 import BeautifulSoup
#------------------------------------------------------
def Mysqls():
return sql.Mysql('127.0.0.1','root','123456','test_msccms')
#------------------------------------------------------
class dianping:
    """Scraper for Dianping (dianping.com) ranking pages and shop details.

    NOTE(review): class name kept lowercase for backward compatibility with
    existing callers; PEP 8 would prefer `Dianping`.
    """

    # Browser User-Agent header: the site rejects the default urllib UA,
    # which is why the original code built a custom opener.
    _UA_HEADER = ('User-Agent',
                  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')

    def __init__(self):
        self.names = ''    # shop name of the most recently parsed page
        self.cturl = []    # restaurant detail-page URLs
        self.ctname = []   # restaurant names
        self.ctaddr = []   # restaurant addresses
        self.users = []    # reviewer user names
        self.datas = []    # review text snippets
        self.tms = []      # review timestamps

    def _fetch(self, htmlurl):
        """Download *htmlurl* with a browser User-Agent; return raw bytes."""
        opener = urllib.request.build_opener()
        opener.addheaders = [self._UA_HEADER]
        return opener.open(htmlurl).read()

    def get_ct_url(self, htmlurl):
        """Parse a ranking page and accumulate restaurant URLs/names/addresses.

        Returns the instance lists ``(cturl, ctname, ctaddr)`` (they grow
        across repeated calls on the same instance).
        """
        self.htmlurl = htmlurl
        soup = BeautifulSoup(self._fetch(htmlurl), "html.parser")
        self.names = soup.span.string
        print('\n店名:', soup.span.string)
        # Restaurant names.
        for tag in soup.find_all(attrs={"class": "field-name"}):
            try:
                # The original author noted this print was needed to surface
                # the exception; kept for behavioural parity (also logs progress).
                print(tag)
                name = BeautifulSoup(str(tag), "html.parser").div.string
                self.ctname.append(name)
            except Exception:
                # Best-effort: a missing/odd <div> yields an empty name
                # instead of aborting the whole page.
                self.ctname.append('')
        # Restaurant addresses.
        for tag in soup.find_all(attrs={"class": "field-addr"}):
            self.ctaddr.append(BeautifulSoup(str(tag), "html.parser").div.string)
        # Restaurant detail URLs: anchors that open in a new tab and carry
        # no visible text are the shop links on this page layout.
        for tag in soup.find_all(attrs={"target": "_blank"}):
            anchor = BeautifulSoup(str(tag), "html.parser").a
            if anchor.string is None:
                self.cturl.append(anchor.attrs['href'])
        return self.cturl, self.ctname, self.ctaddr

    def get_ct_pinlun(self, htmlurl):
        """Parse a shop page for reviews: user names, texts and timestamps.

        Returns ``(names, htmlurl, users, datas, tms)``.
        """
        self.htmlurl = htmlurl
        # Consistency fix: use the same UA-equipped fetch as the other
        # methods (previously a plain urlopen without headers).
        soup = BeautifulSoup(self._fetch(htmlurl), "html.parser")
        self.names = soup.span.string
        print('\n店名:', soup.span.string)
        # Reviewer names.
        for tag in soup.find_all(attrs={"class": "name", "rel": "nofollow"}):
            self.users.append(BeautifulSoup(str(tag), "html.parser").a.string)
        # Review timestamps: collapse the '\xa0\xa0' (double &nbsp;) separator.
        for tag in soup.find_all("span", {"class": "time"}):
            stamp = BeautifulSoup(str(tag), "html.parser").span.string
            self.tms.append(''.join(str(stamp).split('\xa0\xa0')))
        # Review bodies: crude extraction of the text between '>' and '<'
        # in the second whitespace-separated chunk of the raw tag HTML.
        for tag in soup.find_all("p", {"class": "desc"}):
            parts = str(tag).split()
            try:
                self.datas.append(parts[1].split('>')[1:][0].split('<')[0])
            except Exception:
                # Review without the expected structure: skip it.
                continue
        return self.names, self.htmlurl, self.users, self.datas, self.tms

    def get_ct_info(self, htmlurl):
        """Parse a restaurant detail page; return ``(names, phones, addrs)``."""
        self.htmlurl = htmlurl
        soup = BeautifulSoup(self._fetch(htmlurl), "html.parser")
        # Shop name: the <title> text before the literal '电话' (phone) marker.
        names = soup.title.string.split('电话')[0]
        # Street address.
        addr_tags = soup.find_all(attrs={"class": "item",
                                         "itemprop": "street-address"})
        addrs = BeautifulSoup(str(addr_tags), "html.parser").span.string.split()[0]
        # Phone number.
        tel_tags = soup.find_all(attrs={"class": "item", "itemprop": "tel"})
        phones = BeautifulSoup(str(tel_tags), "html.parser").span.string.split()[0]
        return names, phones, addrs

    def run(self, htmlurl):
        """Crawl one ranking page and print details for each listed shop."""
        cturl, ctname, ctaddr = dianping().get_ct_url(htmlurl)
        n = 1
        for u in ctname[1:]:
            try:
                print(htmlurl, cturl[n], u, ctaddr[n])
                # BUG FIX: get_ct_info returns (names, phones, addrs); the
                # original unpacked it as (names, addrs, phones), silently
                # swapping address and phone.
                names, phones, addrs = dianping().get_ct_info(cturl[n])
                print(names, phones, addrs)
            except Exception:
                # Best-effort: log the failing restaurant and keep going.
                print('F', u)
            n = n + 1
            time.sleep(1)  # throttle requests; be polite to the server
##==============================================================================================================
if __name__ == "__main__":
    # Crawl ranking pages 1..50.
    base_url = 'http://dpindex.dianping.com/dpindex?type=rank&p='
    for page in range(1, 51):
        page_url = base_url + str(page)
        print(page_url)
        # BUG FIX: the original printed page_url but always crawled the
        # hard-coded page-1 URL; now each built URL is actually crawled.
        dianping().run(page_url)