Fetching Public Information with Python

This post shows how to use Python for HTTP requests and data parsing to scrape shop information from Dianping (dianping.com). First, listings are broken down by area and shop type so that no query exceeds 50 result pages. Next, the obfuscated font is decoded and the basic information from each listing page is stored in a database. Finally, details such as ratings and addresses are fetched through ajax endpoints and written back to the database.

Dianping caps shop listings at 50 pages per query, so crawling as completely as possible means breaking each query down by area and shop type until it fits under that cap. Taking restaurants in Chengdu as an example: for each category, the area is narrowed step by step from the whole city to districts/counties to streets. As soon as a category within an area comes in under 50 pages it can be crawled directly; otherwise it is subdivided further.
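
The subdivision logic boils down to a short recursion. The sketch below is only the idea, not the implementation: get_page_count, subareas_of, area_url_suffix and record are hypothetical stand-ins for the page-count check, the area-code table, the URL building and the database writes done by the full script further down.

def subdivide(url, area):
    pages = get_page_count(url)
    if pages == 0:
        return                            # no shops for this combination
    if pages < 50:
        record(url, pages)                # small enough to crawl directly
    elif not subareas_of(area):
        record(url, 50, islast=True)      # finest level, still capped: record anyway
    else:
        for child in subareas_of(area):   # city -> districts/counties -> streets
            subdivide(url + area_url_suffix(child), child)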

Dianping sometimes obfuscates page data by rendering it in a custom font of its own. Downloading the corresponding woff font and transcoding the characters recovers the text; when a page happens not to be obfuscated, this step can simply be skipped.
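
Finding the font is the only fiddly part: the listing page links a stylesheet that declares the obfuscation fonts. A minimal download sketch follows; the s3plus host in the patterns and the exact quoting inside the CSS are assumptions based on how the site served these files at the time, so the regexes may need adjusting.

import re
import requests

page = requests.get(listurl, headers=headers).text  # listurl: any obfuscated listing page
csspath = re.search(r'href="(//s3plus[^"]+\.css)"', page).group(1)
css = requests.get("https:" + csspath).text
for woffpath in set(re.findall(r'url\("?(//s3plus[^")]+\.woff)"?\)', css)):
    fname = woffpath.rsplit("/", 1)[-1]
    with open(fname, "wb") as f:  # save locally for fontTools to read
        f.write(requests.get("https:" + woffpath).content)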

The pipeline is therefore: first break the listings down by area and category until each query is under 50 pages and store those queries in a database; then crawl the basic information page by page; finally, use the ajax endpoints observed in the browser to fetch details such as the exact address, coordinates, individual ratings, and review counts.
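
The scripts expect three MySQL tables: dazhong_paging_restaurant (the subdivided queries and their crawl progress), chengduareacode (the area hierarchy used for subdividing, which must be pre-filled with Dianping's area codes for Chengdu), and shopdetail_restaurant (the shop data). A possible initialization; the column names come from the queries in the scripts, but the types are my own assumptions:

import pymysql

DDL = [
    """CREATE TABLE IF NOT EXISTS dazhong_paging_restaurant (
        mainParentCategoryId INT,
        pageCount INT,
        countryid INT,
        url VARCHAR(255) PRIMARY KEY,
        islast TINYINT,
        hasGet INT,
        finish TINYINT
    )""",
    """CREATE TABLE IF NOT EXISTS chengduareacode (
        countryid INT PRIMARY KEY,
        countryname VARCHAR(64),
        parentid INT
    )""",
    """CREATE TABLE IF NOT EXISTS shopdetail_restaurant (
        shopid VARCHAR(32) PRIMARY KEY,
        shopAllname VARCHAR(128), shopName VARCHAR(128), branchName VARCHAR(128),
        shopGroupId VARCHAR(32), defaultReviewCount INT, avgPrice DECIMAL(10,2),
        mainParentCategoryId INT, categoryName VARCHAR(64), countryid INT,
        countryname VARCHAR(64), status VARCHAR(32), recommend VARCHAR(255),
        fivescore DECIMAL(3,1), scoreTaste DECIMAL(3,1),
        scoreEnvironment DECIMAL(3,1), scoreService DECIMAL(3,1),
        address VARCHAR(255), phoneNo VARCHAR(64), glat DOUBLE, glng DOUBLE
    )""",
]

conn = pymysql.connect(host='localhost', user='root', passwd='your password', db='大众点评')
cur = conn.cursor()
for stmt in DDL:
    cur.execute(stmt)
conn.commit()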

# -*- coding: utf-8 -*-
import json
import requests
import pymysql
import time
from fontTools.ttLib import TTFont


def woff_dict(key):
        if key == 'address':
            woff = TTFont('C:\\Users\\Administrator\\Desktop\\address.woff') # load the downloaded woff file
        elif key == 'num':
            woff = TTFont('C:\\Users\\Administrator\\Desktop\\num.woff') # load the downloaded woff file
        # The 601 characters occupying glyph IDs 2~602 in the woff file
        woff_str_601 = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'
        # The font's glyph order pairs each glyph name with one character above
        woff_unicode = woff['cmap'].tables[0].ttFont.getGlyphOrder()  # glyph names for all 603 glyphs
        woff_character = ['.notdef', 'x'] + list(woff_str_601) # prepend the two special glyphs with IDs 0 and 1
        woff_dict = dict(zip(woff_unicode, woff_character))
        return woff_dict

def decodestr(firststr):
    # Decode an obfuscated fragment: plain text passes through, while the hex
    # codes inside class="address"/"num" tags are mapped via the font dicts
    strlist = firststr.split("<")
    laststr = ""
    for single in strlist:
        single = single.replace("/d>","").replace("/e>","")
        if single.find("address")>0:
            single = single[-5:-1]
            laststr += addressdict[single]
            #print(addressdict[single])
        elif single.find("num")>0:
            single = single[-5:-1]
            #print(numdict[single])
            laststr += numdict[single]
        elif single !="":
            laststr += single
    return laststr
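
As a concrete illustration of what decodestr handles (the entity code f33a below is invented for the example):

sample = '3<d class="num">&#xf33a;</d>8'
# sample.split("<") -> ['3', 'd class="num">&#xf33a;', '/d>8']
# The middle piece contains "num", so single[-5:-1] extracts 'f33a' and
# numdict['f33a'] is appended; the "/d>" remnant is stripped from the last
# piece, leaving '8'. So decodestr(sample) == '3' + numdict['f33a'] + '8'.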

# Get the number of result pages for the given URL (retries on failure or blocking)
def getpagecount(URLstr, countryname):
    try:
        res = requests.get(URLstr, headers=headers).text
    except Exception:
        time.sleep(120)
        return getpagecount(URLstr, countryname)
    # If the crawl is being blocked, sleep and then retry
    if res.find("403 Forbidden") > 0:
        time.sleep(60)
        print(URLstr + "  403 forbidden   " + countryname)
        return getpagecount(URLstr, countryname)
    # No matching shops at all
    if res.find("没有找到符合条件的商户") > 0:
        pageCount = 0
    # If the pagination bar is missing there is only one page
    elif res.find("div class=\"page\"") < 0:
        pageCount = 1
        print(URLstr + " 1 page   " + countryname)
    else:
        pageCount = 1  # fallback in case no numbered link is found
        pagestr = res[res.find("div class=\"page\""):]
        pagestr = pagestr[:pagestr.find("</div>")].replace("title=\"下一页\">下一页", "")
        pagestr = pagestr.split("</a>")
        pagestr.reverse()
        # The last numbered link in the pagination bar is the page count
        for page in pagestr:
            if page.find("title=\"") > 0:
                pageCount = page[page.find("title=\"") + 7:]
                pageCount = pageCount[:pageCount.find("\"")]
                print(URLstr + " " + pageCount + " pages  " + countryname)
                pageCount = int(pageCount)
                break
    return pageCount

if __name__ == '__main__':
    # Strip the 'uni' prefix from the glyph names so the keys match the
    # 4-hex-digit codes extracted from the page (e.g. 'uniXXXX' -> 'XXXX')
    numdict = {k.replace('uni', '', 1): v for k, v in woff_dict('num').items()}
    addressdict = {k.replace('uni', '', 1): v for k, v in woff_dict('address').items()}
    
    baseURL = "https://www.dianping.com/chengdu/ch10"  # starting point: the Chengdu food channel
    requeststr1 = baseURL
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
        "Cookie" : "自己的cookie",
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    }

    # Open the database connection
    conn = pymysql.connect(host='localhost', user="root", passwd="your password", db="大众点评")
    cur = conn.cursor()
    querysql = "SELECT mainParentCategoryId,pageCount,countryid,url,islast FROM dazhong_paging_restaurant"
    cur.execute(querysql)
    if cur.rowcount < 1:
        print("The paging table needs to be initialized first")
    else:
        lists = cur.fetchall()
        for row in lists:
            mainParentCategoryId = row[0]
            pageCount = row[1]
            countryid = row[2]
            URLstr = row[3]
            islast = row[4]
            # A row still needs work if it has no page count yet, or sits at 50
            # pages without being marked as the finest possible subdivision
            if pageCount is None or (pageCount == 50 and islast != 1):
                # Fetch the page count for the current URL
                pageCount = getpagecount(URLstr, "")
                if pageCount == 0:
                    continue
                # Within 50 pages: record it in the database
                if pageCount < 50:
                    insertSQLStrings = "REPLACE INTO `大众点评`.`dazhong_paging_restaurant`(`mainParentCategoryId`, `pageCount`, `countryid`, `url`) VALUES ({},{},{},'{}')".format(mainParentCategoryId, pageCount, countryid, URLstr)
                    cur.execute(insertSQLStrings)
                # Over 50 pages: subdivide into districts/counties, and if those
                # still exceed 50 pages, go down to street level
                else:
                    querysql = "SELECT countryid,countryname,parentid FROM chengduareacode WHERE parentid = {}".format(countryid)
                    cur.execute(querysql)
                    # Already at the finest level yet still over 50 pages: record it as-is
                    if cur.rowcount < 1:
                        insertSQLStrings = "REPLACE INTO `大众点评`.`dazhong_paging_restaurant`(`mainParentCategoryId`, `pageCount`, `countryid`, `url`,`islast`) VALUES ({},50,{},'{}',1)".format(mainParentCategoryId, countryid, URLstr)
                        cur.execute(insertSQLStrings)
                    else:
                        countryids = cur.fetchall()
                        for arearow in countryids:
                            time.sleep(11)
                            countryname = arearow[1]
                            countryid = arearow[0]
                            # A few area codes use the "r" URL segment, the rest use "c"
                            if countryid in (10, 35, 36, 37, 38, 39, 4956):
                                URLstrnew = URLstr + "r" + str(countryid)
                            else:
                                URLstrnew = URLstr + "c" + str(countryid)
                            pageCount = getpagecount(URLstrnew, countryname)
                            if pageCount == 0:
                                continue
                            # Drop the city-level entry before writing the finer-grained ones
                            insertSQLString1 = "DELETE from `大众点评`.`dazhong_paging_restaurant` where url='{}'".format(URLstr)
                            cur.execute(insertSQLString1)
                            if pageCount < 50:
                                insertSQLString2 = "REPLACE INTO `大众点评`.`dazhong_paging_restaurant`(`mainParentCategoryId`, `pageCount`, `countryid`, `url`) VALUES ({},{},{},'{}')".format(mainParentCategoryId, pageCount, countryid, URLstrnew)
                                cur.execute(insertSQLString2)
                                URLstrnew = URLstr
                            else:
                                # Keep subdividing, down to street level
                                querysql = "SELECT countryid,countryname,parentid FROM chengduareacode WHERE parentid = {}".format(countryid)
                                cur.execute(querysql)
                                # Already at the finest level yet still over 50 pages: record it as-is
                                if cur.rowcount < 1:
                                    insertSQLStrings = "REPLACE INTO `大众点评`.`dazhong_paging_restaurant`(`mainParentCategoryId`, `pageCount`, `countryid`, `url`,`islast`) VALUES ({},50,{},'{}',1)".format(mainParentCategoryId, countryid, URLstr)
                                    cur.execute(insertSQLStrings)
                                else:
                                    streetids = cur.fetchall()
                                    for streetrow in streetids:
                                        time.sleep(11)
                                        countryid = streetrow[0]
                                        URLstrnew = URLstr + "r" + str(countryid)
                                        pageCount = getpagecount(URLstrnew, "")
                                        if pageCount == 0:
                                            continue
                                        if pageCount < 50:
                                            # Street level fits within 50 pages: record it
                                            insertSQLString2 = "REPLACE INTO `大众点评`.`dazhong_paging_restaurant`(`mainParentCategoryId`, `pageCount`, `countryid`, `url`) VALUES ({},{},{},'{}')".format(mainParentCategoryId, pageCount, countryid, URLstrnew)
                                            cur.execute(insertSQLString2)
                                            URLstrnew = URLstr
                                        # Finest subdivision still at 50 pages: record it with islast=1
                                        elif pageCount == 50:
                                            insertSQLStrings = "REPLACE INTO `大众点评`.`dazhong_paging_restaurant`(`mainParentCategoryId`, `pageCount`, `countryid`, `url`,`islast`) VALUES ({},50,{},'{}',1)".format(mainParentCategoryId, countryid, URLstrnew)
                                            cur.execute(insertSQLStrings)
                                            URLstrnew = URLstr
                                            print("Finest subdivision still reaches the 50-page cap:")
                                            print(insertSQLStrings)
                            conn.commit()
                conn.commit()

Once this step is complete, the basic listing information is crawled from these subdivided links.
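
One general note before the listing crawler: all three scripts build SQL by string formatting, which is why values such as the shop name need manual quote-escaping. pymysql also supports parameterized queries, which do the escaping themselves; a sketch of an equivalent insert:

insert = ("REPLACE INTO `大众点评`.`shopdetail_restaurant` "
          "(`shopid`, `shopAllname`, `avgPrice`) VALUES (%s, %s, %s)")
cur.execute(insert, (shopid, shopAllname, avgPrice))  # pymysql escapes the values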

# -*- coding: utf-8 -*-
import json
import requests
from fontTools.ttLib import TTFont
import pymysql
import time


def woff_dict(key):
        if key == 'address':
            woff = TTFont('C:\\Users\\Administrator\\Desktop\\address.woff') # load the downloaded woff file
        elif key == 'num':
            woff = TTFont('C:\\Users\\Administrator\\Desktop\\num.woff') # load the downloaded woff file
        # The 601 characters occupying glyph IDs 2~602 in the woff file
        woff_str_601 = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'
        # The font's glyph order pairs each glyph name with one character above
        woff_unicode = woff['cmap'].tables[0].ttFont.getGlyphOrder()  # glyph names for all 603 glyphs
        woff_character = ['.notdef', 'x'] + list(woff_str_601) # prepend the two special glyphs with IDs 0 and 1
        woff_dict = dict(zip(woff_unicode, woff_character))
        return woff_dict

def decodestr(firststr):
    # Decode an obfuscated fragment: plain text passes through, while the hex
    # codes inside class="address"/"num" tags are mapped via the font dicts
    strlist = firststr.split("<")
    laststr = ""
    for single in strlist:
        single = single.replace("/d>","").replace("/e>","")
        if single.find("address")>0:
            single = single[-5:-1]
            laststr += addressdict[single]
            #print(addressdict[single])
        elif single.find("num")>0:
            single = single[-5:-1]
            #print(numdict[single])
            laststr += numdict[single]
        elif single !="":
            laststr += single
    return laststr

if __name__ == '__main__':
    # Strip the 'uni' prefix from the glyph names so the keys match the
    # 4-hex-digit codes extracted from the page (e.g. 'uniXXXX' -> 'XXXX')
    numdict = {k.replace('uni', '', 1): v for k, v in woff_dict('num').items()}
    addressdict = {k.replace('uni', '', 1): v for k, v in woff_dict('address').items()}

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
        "Cookie" : "自己的",
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    }

    conn = pymysql.connect(host='localhost', user="root", passwd="your password", db="大众点评")
    cur = conn.cursor()
    querysql = "SELECT url,hasGet,finish FROM dazhong_paging_restaurant"
    cur.execute(querysql)
    lists = cur.fetchall()
    for row in lists:
        url = row[0]
        hasGet = row[1]
        finish = row[2]
        if hasGet is None:
            hasGet = 0
        hasGet += 1
        if finish != 1:
            url += "o3p"  # listing suffix; the page number gets appended below
            for i in range(1, 51):
                if hasGet > i:
                    print("Already crawled, skipping page " + str(i))
                    continue
                urlnew = url + str(i)
                try:
                    res = requests.get(urlnew, headers=headers, timeout=100).text
                except Exception:
                    time.sleep(80)
                    res = requests.get(urlnew, headers=headers, timeout=100).text
                # If the crawl is being blocked, stop entirely
                if res.find("403 Forbidden") > 0:
                    print("403: access blocked, exiting")
                    exit()
                # No more results: leave this category
                if res.find("没有找到符合条件的商户") > 0:
                    break
                res = res[res.find("shop-all-list"):res.find("商户没有被收录")]
                res = res.split("<li class=\"\" >")
                for item in res:
                    if len(item) < 50:
                        continue
                    shopid = item[item.find("data-shopid=\"") + 13:]
                    shopid = shopid[:shopid.find("\"")]
                    shopAllname = item[item.find("<h4>") + 4:item.find("</h4>")].replace("'", "\\'")
                    if item.find("https://www.dianping.com/brands/") > 0:
                        shopGroupId = item[item.find("https://www.dianping.com/brands/") + 32:item.find("\" module=\"list-branch\"")]
                    else:
                        shopGroupId = ""
                    # "我要评价" ("write a review") only appears for shops without reviews
                    if item.find("我要评价") > 0:
                        defaultReviewCount = 0
                    else:
                        defaultReviewCount = item[item.find("<b>") + 3:item.find("</b>")]
                    # A dash in place of the figure after "人均" means no average price
                    avgPrice = item[item.find("人均"):]
                    if avgPrice.find("-") == 13:
                        avgPrice = 0
                    else:
                        avgPrice = avgPrice[avgPrice.find("<b>") + 4:avgPrice.find("</b>")]
                    if item.find("istopTrade") > 0:
                        status = item[item.find("istopTrade") + 12:]
                        status = status[:status.find("</span>")]
                    else:
                        status = ""
                    countryAndtype = item[item.find("tag-addr"):]
                    mainParentCategoryId = countryAndtype[countryAndtype.find("/g") + 2:countryAndtype.find("\" data-click-name")]
                    categoryName = countryAndtype[countryAndtype.find("class=\"tag\">") + 12:countryAndtype.find("</span>")]
                    countryAndtype = countryAndtype[countryAndtype.find("\"sep\""):]
                    countryid = countryAndtype[countryAndtype.find("/r") + 2:countryAndtype.find("\" data-click-name")]
                    countryname = countryAndtype[countryAndtype.find("class=\"tag\">") + 12:countryAndtype.find("</span>")]
                    if countryid.find("|") > 0:
                        print("Malformed shop entry skipped: " + shopid)
                        continue
                    if item.find("class=\"recommend\"") > 0:
                        recommendstr = item[item.find("class=\"recommend\"") + 16:]
                        recommendstr = recommendstr[:recommendstr.find("</div>")]
                        recommendstr = recommendstr.split("\">")
                        recommend = ""
                        for recommendtemp in recommendstr:
                            if recommendtemp.find("</a>") > 0:
                                recommendtemp = recommendtemp[:recommendtemp.find("</a>")]
                                recommend = recommend + recommendtemp + " "
                    else:
                        recommend = ""
                    print(shopid + " " + shopAllname + " " + shopGroupId + " " + str(defaultReviewCount) + " " + str(avgPrice) + " " + mainParentCategoryId + " " + categoryName + " " + countryid + " " + countryname + " " + status + " " + recommend)
                    insertSQLStrings = "REPLACE INTO `大众点评`.`shopdetail_restaurant`(`shopid`, `shopAllname`, `shopGroupId`, `defaultReviewCount`,`avgPrice`,`mainParentCategoryId`,`categoryName`,`countryid`,`countryname`,`status`,`recommend`) VALUES ('{}','{}','{}',{},{},{},'{}',{},'{}','{}','{}')".format(shopid, shopAllname, shopGroupId, defaultReviewCount, avgPrice, mainParentCategoryId, categoryName, countryid, countryname, status, recommend)
                    cur.execute(insertSQLStrings)
                print("Page " + str(i) + " crawled")
                updatesql1 = "UPDATE dazhong_paging_restaurant SET hasGet={} WHERE url='{}'".format(i, row[0])
                cur.execute(updatesql1)
                conn.commit()
                time.sleep(15)
            updatesql2 = "UPDATE dazhong_paging_restaurant SET finish=1 WHERE url='{}'".format(row[0])
            cur.execute(updatesql2)
            conn.commit()

Finally, richer details (exact address, coordinates, individual ratings, review counts, phone number) are fetched through the ajax endpoints observed in the browser.
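
Before wiring an endpoint into the pipeline, it is worth probing it once to see what comes back (the shopId below is a placeholder):

import requests

probe = "https://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?shopId=XXXXXXXX&cityId=1&mainCategoryId=10"
res = requests.get(probe, headers={"User-Agent": "Mozilla/5.0"}).json()
print(sorted(res.keys()))  # lists the available fields (avgPrice, fiveScore, ...)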

# -*- coding: utf-8 -*-
import json
import requests
from fontTools.ttLib import TTFont
import pymysql
import time


def woff_dict(key):
        if key == 'address':
            woff = TTFont('C:\\Users\\Administrator\\Desktop\\address.woff') # load the downloaded woff file
        elif key == 'num':
            woff = TTFont('C:\\Users\\Administrator\\Desktop\\num.woff') # load the downloaded woff file
        # The 601 characters occupying glyph IDs 2~602 in the woff file
        woff_str_601 = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'
        # The font's glyph order pairs each glyph name with one character above
        woff_unicode = woff['cmap'].tables[0].ttFont.getGlyphOrder()  # glyph names for all 603 glyphs
        woff_character = ['.notdef', 'x'] + list(woff_str_601) # prepend the two special glyphs with IDs 0 and 1
        woff_dict = dict(zip(woff_unicode, woff_character))
        return woff_dict

def decodestr(firststr):
    # Decode an obfuscated fragment: plain text passes through, while the hex
    # codes inside class="address"/"num" tags are mapped via the font dicts
    strlist = firststr.split("<")
    laststr = ""
    for single in strlist:
        single = single.replace("/d>","").replace("/e>","")
        if single.find("address")>0:
            single = single[-5:-1]
            laststr += addressdict[single]
            #print(addressdict[single])
        elif single.find("num")>0:
            single = single[-5:-1]
            #print(numdict[single])
            laststr += numdict[single]
        elif single !="":
            laststr += single
    return laststr

if __name__ == '__main__':
    # Strip the 'uni' prefix from the glyph names so the keys match the
    # 4-hex-digit codes extracted from the page (e.g. 'uniXXXX' -> 'XXXX')
    numdict = {k.replace('uni', '', 1): v for k, v in woff_dict('num').items()}
    addressdict = {k.replace('uni', '', 1): v for k, v in woff_dict('address').items()}

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
        #"Cookie" : "自己的",
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    }

    conn = pymysql.connect(host='localhost', user="root", passwd="your password", db="大众点评")
    cur = conn.cursor()
    querysql = "SELECT shopid FROM shopdetail_restaurant where fivescore is NULL"
    cur.execute(querysql)
    lists = cur.fetchall()
    for row in lists:
        shopid = row[0]
        # Three ajax endpoints observed in the browser's network panel
        requeststr1 = "https://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?shopId={}&cityId=1&mainCategoryId=10".format(shopid)
        requeststr2 = "https://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?shopId=" + shopid
        requeststr3 = "https://www.dianping.com/ajax/json/shopDynamic/shopAside?shopId=" + shopid
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
        }
        # Ratings and review counts (the numeric fields are font-obfuscated)
        res = requests.get(requeststr1, headers=headers).json()
        avgPrice = decodestr(res['avgPrice'])
        defaultReviewCount = decodestr(res['defaultReviewCount'])
        try:
            fivescore = res['fiveScore']
        except KeyError:
            fivescore = '-'
        if fivescore == "-":
            fivescore = 0
        scoreTaste = decodestr(res['shopRefinedScoreValueList'][0])
        if scoreTaste == "-":
            scoreTaste = 0
        scoreEnvironment = decodestr(res['shopRefinedScoreValueList'][1])
        if scoreEnvironment == "-":
            scoreEnvironment = 0
        scoreService = decodestr(res['shopRefinedScoreValueList'][2])
        if scoreService == "-":
            scoreService = 0
        # Name, exact address and phone number (also obfuscated)
        res = requests.get(requeststr2, headers=headers).json()
        shopName = res['msg']['shopInfo']['shopName'].replace("'", "\\'")
        branchName = res['msg']['shopInfo']['branchName']
        address = decodestr(res['msg']['shopInfo']['address']).replace("'", "\\'")
        phoneNo = decodestr(res['msg']['shopInfo']['phoneNo'])
        shopGroupId = res['msg']['shopInfo']['shopGroupId']
        if shopGroupId == shopid:
            shopGroupId = ""
        # Coordinates and category from the sidebar endpoint
        res = requests.get(requeststr3, headers=headers).json()
        glat = res['shop']['glat']
        glng = res['shop']['glng']
        categoryName = res['category']['categoryName']
        if branchName is None:
            branchName = ""
        print(avgPrice + " " + defaultReviewCount + " " + str(fivescore) + " " + str(scoreTaste) + " " + str(scoreEnvironment) + " " + str(scoreService) + " " + shopName + " " + branchName + " " + address + " " + phoneNo + " " + shopGroupId + " " + str(glat) + " " + str(glng) + " " + categoryName)
        insertSQLStrings = "update `大众点评`.`shopdetail_restaurant` SET `fivescore` = {},`scoreTaste`={},`scoreEnvironment`={},`scoreService`={},`avgPrice`={},`defaultReviewCount`={},`shopName`='{}',`branchName`='{}',`address`='{}',`phoneNo`='{}',`shopGroupId`='{}',`glat`={},`glng`={} WHERE shopid = '{}'".format(fivescore, scoreTaste, scoreEnvironment, scoreService, avgPrice, defaultReviewCount, shopName, branchName, address, phoneNo, shopGroupId, glat, glng, shopid)
        cur.execute(insertSQLStrings)
        conn.commit()
        time.sleep(2)

That completes the whole pipeline: the shopdetail_restaurant table now holds the full record for every shop found.
