Scraping Dianping user reviews with bs4

A small Python 3 script that uses urllib and BeautifulSoup to crawl the Dianping (大众点评) ranking pages, pull each restaurant's name, address, phone number and detail-page URL, and collect reviewer names, review texts and timestamps, with optional storage into MySQL via a local helper module.
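The core pattern used throughout is: fetch the page HTML with urllib, parse it with BeautifulSoup's html.parser, then pick out tags by their class attributes and read their text. A minimal standalone illustration of that pattern (the HTML snippet here is hypothetical, not Dianping's actual markup):

from bs4 import BeautifulSoup

# Hypothetical snippet shaped like the ranking-page entries handled below
html = '<div class="field-name"><div>某餐厅</div></div>'
soup = BeautifulSoup(html, "html.parser")
for tag in soup.find_all(attrs={"class": "field-name"}):
    print(tag.div.string)   # -> 某餐厅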

# -*- coding: utf-8 -*-
__author__ = 'Administrator'
import sys,urllib.request
sys.path.append('./')
import sql
import time
from bs4 import BeautifulSoup
#------------------------------------------------------
def Mysqls():
    # Open a connection to the local MySQL database via the sql helper module
    # (host, user, password, database).
    return sql.Mysql('127.0.0.1', 'root', '123456', 'test_msccms')
#------------------------------------------------------
class dianping:
    def __init__(self):
        self.names = ''    # shop name taken from the page
        self.cturl = []    # restaurant detail-page URLs
        self.ctname = []   # restaurant names
        self.ctaddr = []   # restaurant addresses
        self.users = []    # reviewer usernames
        self.datas = []    # review texts
        self.tms = []      # review timestamps

    def get_ct_url(self, htmlurl):
        # Fetch one ranking page and collect restaurant URLs, names and addresses.
        self.htmlurl = htmlurl
        headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        htmlline = opener.open(self.htmlurl).read()
        soup = BeautifulSoup(htmlline, "html.parser")
        self.names = soup.span.string

        print('\nShop:', soup.span.string)
        # Restaurant names
        for i in soup.find_all(attrs={"class": "field-name"}):
            try:
                # Original note: the print is required here -- without it the
                # exception is not triggered and the program later stops with an error.
                print(i)
                psoup = BeautifulSoup(str(i), "html.parser")
                self.ctname.append(psoup.div.string)
            except Exception:
                # Keep list indexes aligned when an entry cannot be parsed
                self.ctname.append('')
        # Restaurant addresses
        for i in soup.find_all(attrs={"class": "field-addr"}):
            psoup = BeautifulSoup(str(i), "html.parser")
            self.ctaddr.append(psoup.div.string)

        # Restaurant detail-page URLs: links whose anchor text is empty
        for i in soup.find_all(attrs={"target": "_blank"}):
            psoup = BeautifulSoup(str(i), "html.parser")
            if psoup.a.string is None:
                self.cturl.append(psoup.a.attrs['href'])
        return self.cturl, self.ctname, self.ctaddr

    def get_ct_pinlun(self, htmlurl):
        # Fetch a shop page and collect reviewer names, review texts and times.
        self.htmlurl = htmlurl
        page = urllib.request.urlopen(self.htmlurl)
        htmlline = page.read()
        soup = BeautifulSoup(htmlline, "html.parser")
        self.names = soup.span.string

        print('\nShop:', soup.span.string)
        # Reviewer usernames
        for i in soup.find_all(attrs={"class": "name", "rel": "nofollow"}):
            psoup = BeautifulSoup(str(i), "html.parser")
            self.users.append(psoup.a.string)

        # Review timestamps; strip the non-breaking spaces between date and time
        for i in soup.find_all("span", {"class": "time"}):
            tmsoup = BeautifulSoup(str(i), "html.parser").span.string
            tmsjoin = ''.join(str(tmsoup).split('\xa0\xa0'))
            self.tms.append(tmsjoin)

        # Review texts: take the text between the tags of each <p class="desc">
        sps = soup.find_all("p", {"class": "desc"})
        for i in sps:
            strs = str(i).split()
            try:
                dts = strs[1].split('>')[1:][0].split('<')[0]
                self.datas.append(dts)
            except Exception:
                continue
        return self.names, self.htmlurl, self.users, self.datas, self.tms

    def get_ct_info(self, htmlurl):
        # Fetch a shop detail page and return its name, phone number and address.
        self.htmlurl = htmlurl
        headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.93 Safari/537.36')
        opener = urllib.request.build_opener()
        opener.addheaders = [headers]
        htmlline = opener.open(self.htmlurl).read()
        soup = BeautifulSoup(htmlline, "html.parser")

        # Shop name: the part of the <title> before the word '电话' (phone)
        names = soup.title.string.split('电话')[0]
        # Address
        addrs = soup.find_all(attrs={"class": "item", "itemprop": "street-address"})
        ap = BeautifulSoup(str(addrs), "html.parser")
        addrs = ap.span.string.split()[0]
        # Phone number
        phone = soup.find_all(attrs={"class": "item", "itemprop": "tel"})
        pp = BeautifulSoup(str(phone), "html.parser")
        phones = pp.span.string.split()[0]
        return names, phones, addrs

    def run(self, htmlurl):
        cturl, ctname, ctaddr = dianping().get_ct_url(htmlurl)
        #mysql = Mysqls()
        n = 1
        for u in ctname[1:]:
            try:
                print(htmlurl, cturl[n], u, ctaddr[n])
                # get_ct_info returns (names, phones, addrs); unpack in that order
                names, phones, addrs = dianping().get_ct_info(cturl[n])
                print(names, phones, addrs)
                #sqls = "insert into tongji_user_pinglun (ctid,ctname,ctarea,source_url,username,content,cttms) values(%s,'%s','%s','%s','%s','%s','%s');"
                #mysql.cmd(sqls % (ctid, names, ctarea, htmlurl, u, datas[n], tms[n]))
                #mysql.commit()
            except Exception:
                print('F', u)
            n += 1
            time.sleep(1)  # be polite: one request per second
        #mysql.close()

##==============================================================================================================

if __name__ == "__main__":
    # The ranking index has 50 pages; print their URLs, then crawl page 1 as a demo.
    url = 'http://dpindex.dianping.com/dpindex?type=rank&p='
    for i in range(1, 51):
        print(url + str(i))

    dianping().run('http://dpindex.dianping.com/dpindex?type=rank&p=1')
    #dianping().get_ct_info('http://www.dianping.com/shop/4708533')
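
The script also imports a local sql helper module that the post does not include. Judging from the calls above — sql.Mysql(host, user, password, db) plus the commented-out cmd(), commit() and close() — a minimal sketch might look like the following. This is an assumed reconstruction using pymysql, not the author's original file:

# sql.py -- hypothetical reconstruction; the original module is not in the post.
# Method names are inferred from the calls in the scraper above.
import pymysql

class Mysql:
    def __init__(self, host, user, password, db):
        # utf8mb4 so Chinese shop names and review text store cleanly
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=db, charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def cmd(self, sqls):
        # Execute one SQL statement (the scraper builds these with % formatting)
        self.cursor.execute(sqls)

    def commit(self):
        self.conn.commit()

    def close(self):
        self.cursor.close()
        self.conn.close()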


Reposted from: https://my.oschina.net/jk409/blog/659108
