大众点评实战解决反爬

前面讲了大众点评的反爬解决措施,今天就实际操作一次:
做一个抓取大众点评商铺名字、价格、评价、地址、推荐菜的爬虫。
我之前试过爬全站,爬了几千条之后大众点评就会限制访问;不使用代理 ip 的话,本机 ip 会被封。所以这里我限制了抓取数量:只爬一个类别下 50 页中的一页,也就是 15 条数据。

代码比较粗糙,没有做优化,见谅。
下面这段代码生成的是一个类别下 50 个分页的 url。如果想拿到所有类别的分页 url,只需要依次传入每个类别的 url;而每个类别的 url 好像只需要自己构造一下就可以了。

from fontTools.ttLib import TTFont
from lxml import etree
import requests

import  re
import time
# Load the downloaded web font (presumably the one the site uses to obfuscate
# digits -- the commented-out dump below targets 'shuzi.xml').
font=TTFont('num.woff')
# Handy while rebuilding the glyph lookup tables:
#   font.saveXML('shuzi.xml')
#   print(font.getGlyphOrder())
#   print(font.getBestCmap())
#   print(font.getReverseGlyphMap())


url="http://www.dianping.com/chongqing/ch10/g110"
# Browser-like request headers sent with every request in this script.
# NOTE(review): the Cookie is a hard-coded browser session (it carries values
# such as `dper` and `uamo`); it will expire and should not be committed --
# consider loading it from configuration instead.
headers={
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding":"gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Referer" : "http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/c8452b46b93efe6c8d71f25cbf3fdcf7.css",
    # The value must be the bare UA string; the original value started with a
    # literal "User-Agent: " prefix, which was sent as part of the header value.
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
    "Cookie": "fspop=student; _lxsdk_cuid=17526568150c8-057f2e4e9b0bd1-3323767-151800-17526568150c8; _lxsdk=17526568150c8-057f2e4e9b0bd1-3323767-151800-17526568150c8; _hc.v=f89b5402-1920-6473-8ea3-fe6140109f50.1602666007; s_ViewType=10; _dp.ac.v=482bf0af-d2c4-4521-aa86-05e03cf05d01; ua=%E5%8C%85%E9%9D%92%E5%A4%A9_8258; ctu=092729311bdbf5b249043bc590d0ca4370c52e82ad59fb7de1fdf41599181d0a; aburl=1; cityInfo=%7B%22cityId%22%3A9%2C%22cityName%22%3A%22%E9%87%8D%E5%BA%86%22%2C%22provinceId%22%3A0%2C%22parentCityId%22%3A0%2C%22cityOrderId%22%3A0%2C%22isActiveCity%22%3Afalse%2C%22cityEnName%22%3A%22chongqing%22%2C%22cityPyName%22%3Anull%2C%22cityAreaCode%22%3Anull%2C%22cityAbbrCode%22%3Anull%2C%22isOverseasCity%22%3Afalse%2C%22isScenery%22%3Afalse%2C%22TuanGouFlag%22%3A0%2C%22cityLevel%22%3A0%2C%22appHotLevel%22%3A0%2C%22gLat%22%3A0%2C%22gLng%22%3A0%2C%22directURL%22%3Anull%2C%22standardEnName%22%3Anull%7D; cy=9; cye=chongqing; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1602766315,1602768257,1602853468,1602896940; dper=d2eae47993f8001491f6ff1594faa3b4fb7b929b9b0cfbfc96138e514da123f235eca63223dcd5dbbc8fb1e7a4d8dca8ccb7fb8dea90f404a9da62da0e4314237178bba721e33b13d28c560db470a60cd3841932e1f836250c4052606f36878d; ll=7fd06e815b796be3df069dec7836c3df; uamo=15978973646; dplet=3685a688f2538b66c64c3f6a68877c19; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1602898122; _lxsdk_s=175341a153c-b70-254-d0f%7C%7C149",
    "Host": "www.dianping.com",
    "Upgrade-Insecure-Requests": "1"

}
# NOTE(review): this response is not read by any code visible in this script.
res=requests.get(url=url,headers=headers)


# Accumulates every page URL produced by get_all_urls().
all_pages_url=[]

def get_all_urls(url, max_pages=50, output_path='页面url.txt'):
    """Build the listing-page URLs of one category and save them to a file.

    The page number is the trailing ``p<N>`` of the URL path, e.g.
    ``http://www.dianping.com/chongqing/ch10/g111p1``.  URLs for pages
    ``1..max_pages`` are appended to the module-level ``all_pages_url`` list,
    and the whole list is then written to ``output_path``, one URL per line.

    Args:
        url: Category URL, with or without a ``p<N>`` suffix; a query string
            (``?cpt=...``) is preserved on every generated URL.
        max_pages: Number of listing pages to generate (50 by default).
        output_path: File that receives the accumulated URL list.

    No HTTP request is made here: the URLs are derived purely from the
    pattern, so the request quota is not spent on pages whose content this
    function never used.
    """
    print("每个类别的url拿到 现在要拿这个类别下的所有url")
    # Split the query string off so only the page suffix of the path is edited.
    path, sep, query = url.partition('?')
    has_page_suffix = re.search(r'p\d+$', path) is not None
    for page in range(1, max_pages + 1):
        if has_page_suffix:
            page_path = re.sub(r'p\d+$', 'p%d' % page, path)
        else:
            # Bare category URL (e.g. ".../g110"): append the page suffix.
            page_path = '%sp%d' % (path, page)
        page_url = page_path + sep + query
        print("正在拿第%d页" % page, page_url)
        all_pages_url.append(page_url)

    print(all_pages_url)
    # Text mode already translates '\n' to the platform line ending, so plain
    # '\n' is written (an explicit '\r\n' would become '\r\r\n' on Windows).
    with open(output_path, 'w', encoding='utf-8') as f:
        for page_url in all_pages_url:
            f.write(page_url + '\n')

# Generate the page URLs for one category (g111) and write them to 页面url.txt.
get_all_urls("http://www.dianping.com/chongqing/ch10/g111p1")

代码片

在这里插入图片描述
一共 50 个 url。实际使用的时候可以直接读取这个文件,但是我这里只拿了其中一条出来使用。


from fake_useragent import UserAgent
import time
import requests
import time
import re

# Glyph name -> glyph index for the font used to obfuscate address text.
# The index is used as a position in hanzi_list to recover the real character.
# Keys are "uni" + four hex digits, i.e. the HTML entity "&#xe9dd;" with
# "&#x" replaced by "uni" and the trailing ";" removed.
# NOTE(review): presumably dumped from the font's glyph order; the table has
# to be regenerated whenever the site rotates its font -- confirm.
final_num={'glyph00000': 0, 'x': 1, 'unie9dd': 2, 'unif29c': 3, 'unif6c6': 4, 'unie603': 5, 'unie524': 6, 'unif735': 7, 'unie469': 8, 'uniee09': 9, 'unie642': 10, 'unif8a7': 11, 'unie5b0': 12, 'uniee4e': 13, 'unie77d': 14, 'unied56': 15, 'unif324': 16, 'unif1ae': 17, 'unie2ad': 18, 'uniece4': 19, 'unie92a': 20, 'unie8d3': 21, 'unie81a': 22, 'unieae3': 23, 'unie441': 24, 'unie699': 25, 'unif434': 26, 'unie6ac': 27, 'uniea06': 28, 'unie305': 29, 'unif499': 30, 'unie6a7': 31, 'unif279': 32, 'unie509': 33, 'unie6f2': 34, 'unied44': 35, 'unif077': 36, 'unif510': 37, 'unif545': 38, 'unif375': 39, 'unie1c6': 40, 'unif2ad': 41, 'unif37d': 42, 'uniedd8': 43, 'unie662': 44, 'unie260': 45, 'unie552': 46, 'unif4b4': 47, 'unie4b1': 48, 'unie235': 49, 'unif00d': 50, 'unieafe': 51, 'unie5d8': 52, 'unif691': 53, 'unif0bc': 54, 'unie21f': 55, 'unif792': 56, 'unie306': 57, 'unif4fb': 58, 'uniea17': 59, 'unie37d': 60, 'unif75d': 61, 'unie7ac': 62, 'unif2e6': 63, 'unie25c': 64, 'unif707': 65, 'unie2f6': 66, 'unif109': 67, 'unif00b': 68, 'uniec68': 69, 'unif0cf': 70, 'unie4ce': 71, 'unif16b': 72, 'unif0cd': 73, 'unie02d': 74, 'uniede2': 75, 'unie118': 76, 'unie88f': 77, 'unif422': 78, 'uniec8b': 79, 'uniec9e': 80, 'unie85a': 81, 'uniec4c': 82, 'unif58f': 83, 'unif3fe': 84, 'unif60f': 85, 'unie677': 86, 'unif844': 87, 'unif533': 88, 'unif329': 89, 'unif31e': 90, 'uniefaf': 91, 'unif134': 92, 'unif6a2': 93, 'unie5bb': 94, 'unie328': 95, 'unie26a': 96, 'unie72d': 97, 'uniefd9': 98, 'uniee5d': 99, 'unif6e3': 100, 'unif38f': 101, 'unif6e5': 102, 'unie80f': 103, 'unif1f8': 104, 'unif7cc': 105, 'unif096': 106, 'unie991': 107, 'unie7f0': 108, 'unif386': 109, 'uniefca': 110, 'unif037': 111, 'unie60e': 112, 'unif310': 113, 'unif816': 114, 'unie2a2': 115, 'unie22f': 116, 'unie2aa': 117, 'unif39c': 118, 'unie5d7': 119, 'unif246': 120, 'unie417': 121, 'unie91a': 122, 'unie9f8': 123, 'unif2bd': 124, 'unie4c0': 125, 'unif8e8': 126, 'unif6b4': 127, 'unie93c': 128, 'uniec6b': 129, 'unieeea': 130, 
'unif2da': 131, 'unie205': 132, 'unif074': 133, 'unif426': 134, 'unie216': 135, 'unie787': 136, 'unif149': 137, 'unie411': 138, 'unif82f': 139, 'unif7e7': 140, 'uniefcb': 141, 'uniec93': 142, 'unie175': 143, 'unie698': 144, 'unif026': 145, 'unif5c3': 146, 'unied34': 147, 'unif052': 148, 'unie30c': 149, 'unie7b3': 150, 'unif3e6': 151, 'unie181': 152, 'unif845': 153, 'unie347': 154, 'unie3a0': 155, 'unie751': 156, 'uniea93': 157, 'unie63a': 158, 'unif6fe': 159, 'unie106': 160, 'unif497': 161, 'unie4d9': 162, 'unie38b': 163, 'unie995': 164, 'uniea91': 165, 'unif20f': 166, 'unie6c2': 167, 'unif609': 168, 'unie31f': 169, 'uniefbc': 170, 'unie53e': 171, 'uniedd3': 172, 'unif4e3': 173, 'unie6a0': 174, 'unif0f6': 175, 'unif81c': 176, 'unif527': 177, 'unif2c1': 178, 'unie93d': 179, 'unie79d': 180, 'unie0a8': 181, 'unif61b': 182, 'unie3db': 183, 'unie68a': 184, 'unie261': 185, 'unif79d': 186, 'unie32c': 187, 'unieba2': 188, 'unie1d5': 189, 'unie7e8': 190, 'uniecb5': 191, 'unie871': 192, 'unieb93': 193, 'unie944': 194, 'uniead0': 195, 'unif04c': 196, 'uniea96': 197, 'unif303': 198, 'unif51b': 199, 'unie26e': 200, 'unie990': 201, 'unif139': 202, 'unie61f': 203, 'unief65': 204, 'unie55f': 205, 'unied48': 206, 'unif7f7': 207, 'unif731': 208, 'unif4d2': 209, 'unie1fa': 210, 'uniec0a': 211, 'unif34a': 212, 'unif4ae': 213, 'unie12f': 214, 'uniee27': 215, 'unie4c9': 216, 'uniefc7': 217, 'unif440': 218, 'unie4ef': 219, 'unieae2': 220, 'unif4c7': 221, 'unif7d4': 222, 'unif581': 223, 'unif34f': 224, 'unif732': 225, 'unif11d': 226, 'unie870': 227, 'unie9a8': 228, 'unif445': 229, 'uniebe9': 230, 'unie66e': 231, 'unif798': 232, 'unief2a': 233, 'unif403': 234, 'unie873': 235, 'unif540': 236, 'unie206': 237, 'unif68f': 238, 'unif298': 239, 'unie136': 240, 'uniee45': 241, 'uniec7c': 242, 'unif6e6': 243, 'uniea45': 244, 'unie51a': 245, 'unif572': 246, 'unif1ed': 247, 'unie04b': 248, 'unie7d0': 249, 'unie7f3': 250, 'unif64b': 251, 'unif099': 252, 'unie7c9': 253, 'unif7a0': 254, 'uniea52': 255, 
'unie31c': 256, 'unif4a1': 257, 'unieca9': 258, 'unie4f6': 259, 'uniee70': 260, 'unie10f': 261, 'unif389': 262, 'unif8e2': 263, 'unie6c9': 264, 'uniea13': 265, 'unif020': 266, 'unie3b7': 267, 'unief83': 268, 'unif720': 269, 'unif1a2': 270, 'unie50c': 271, 'unif59b': 272, 'unieff5': 273, 'unie0e9': 274, 'unie22d': 275, 'unif196': 276, 'unie37a': 277, 'unieaf5': 278, 'unie54c': 279, 'uniec09': 280, 'unie75f': 281, 'unie177': 282, 'unieb89': 283, 'unie03d': 284, 'uniecb2': 285, 'uniea3a': 286, 'unied35': 287, 'uniee43': 288, 'unie5ed': 289, 'unie99e': 290, 'unif616': 291, 'unif395': 292, 'unie522': 293, 'unif281': 294, 'uniea62': 295, 'unieac0': 296, 'unie111': 297, 'unif45d': 298, 'unie9ed': 299, 'unie7b2': 300, 'unif10c': 301, 'unie0ca': 302, 'unif10e': 303, 'unie687': 304, 'unie74f': 305, 'unif250': 306, 'unie110': 307, 'uniee02': 308, 'unif06f': 309, 'unie602': 310, 'unif378': 311, 'unif2f1': 312, 'unif664': 313, 'unif263': 314, 'unif6b8': 315, 'unie936': 316, 'unie984': 317, 'unie521': 318, 'unif67e': 319, 'unie3d3': 320, 'unie277': 321, 'unie811': 322, 'uniede0': 323, 'unif02b': 324, 'unif3e0': 325, 'unif3f5': 326, 'unief75': 327, 'unif2e2': 328, 'unif755': 329, 'unif474': 330, 'unie43f': 331, 'unie747': 332, 'unif653': 333, 'unif067': 334, 'unie31e': 335, 'unied6e': 336, 'unie0e6': 337, 'unif6c5': 338, 'unie07a': 339, 'unif26b': 340, 'unif8e0': 341, 'unif883': 342, 'unif74b': 343, 'unie7a3': 344, 'unie3f5': 345, 'unif346': 346, 'unie494': 347, 'unif86c': 348, 'unie5b7': 349, 'uniedd9': 350, 'unie1c8': 351, 'unie452': 352, 'uniecd1': 353, 'unif77e': 354, 'uniea58': 355, 'unif677': 356, 'unif14c': 357, 'unie148': 358, 'unie496': 359, 'unie766': 360, 'uniee8b': 361, 'unif5a4': 362, 'unie69e': 363, 'unie0f3': 364, 'unieea5': 365, 'unif5ea': 366, 'unie2c5': 367, 'unie3f8': 368, 'unie462': 369, 'unied21': 370, 'unif8ac': 371, 'unie71a': 372, 'uniec24': 373, 'unif0d5': 374, 'unied15': 375, 'unie3a7': 376, 'unif06c': 377, 'unie246': 378, 'unif52b': 379, 'unie49c': 380, 
'unie9f6': 381, 'unif308': 382, 'unief9f': 383, 'unie6bd': 384, 'unie8fe': 385, 'unif617': 386, 'uniee14': 387, 'unie26b': 388, 'uniec5d': 389, 'unif0bb': 390, 'unif77f': 391, 'unif2e0': 392, 'uniea6a': 393, 'unie387': 394, 'unie097': 395, 'unie85f': 396, 'unif024': 397, 'unif398': 398, 'unie651': 399, 'unie53d': 400, 'uniea3e': 401, 'unie558': 402, 'unif41b': 403, 'unif85d': 404, 'unied3c': 405, 'unieb34': 406, 'uniec79': 407, 'unie682': 408, 'uniec88': 409, 'unif3ed': 410, 'unie79a': 411, 'unie740': 412, 'unie612': 413, 'unieebd': 414, 'unif8c0': 415, 'unif0db': 416, 'unie3b2': 417, 'unie6b8': 418, 'unif88c': 419, 'unieb3f': 420, 'unie5c7': 421, 'unif31d': 422, 'unie42d': 423, 'unif56d': 424, 'uniedaf': 425, 'unif4a2': 426, 'unie76d': 427, 'unif5aa': 428, 'uniea15': 429, 'unif88b': 430, 'unif52c': 431, 'unied7a': 432, 'unif3c2': 433, 'unif200': 434, 'unie08e': 435, 'uniea39': 436, 'unie9ff': 437, 'unief4e': 438, 'unie537': 439, 'unif198': 440, 'unieafb': 441, 'unif67b': 442, 'uniecd4': 443, 'unie3f7': 444, 'unied7d': 445, 'unief8e': 446, 'unie6b5': 447, 'unif494': 448, 'unif0d0': 449, 'unie7f8': 450, 'unie789': 451, 'unie50a': 452, 'unie970': 453, 'unif492': 454, 'uniebb9': 455, 'unif097': 456, 'unif424': 457, 'unif5ec': 458, 'uniee31': 459, 'unie14e': 460, 'unif566': 461, 'unif836': 462, 'unif471': 463, 'unie8c3': 464, 'unif895': 465, 'unie92f': 466, 'unif7db': 467, 'unif51c': 468, 'unieb2b': 469, 'uniec4d': 470, 'unieabc': 471, 'unie83e': 472, 'unie2b1': 473, 'unif6d6': 474, 'unie1f5': 475, 'unif5a0': 476, 'unif892': 477, 'unif187': 478, 'unied4d': 479, 'uniec11': 480, 'uniecf5': 481, 'unieff2': 482, 'uniea00': 483, 'unif4a5': 484, 'unieb12': 485, 'unie531': 486, 'unie841': 487, 'unie78d': 488, 'unie72e': 489, 'unief52': 490, 'unif039': 491, 'uniebc8': 492, 'uniecac': 493, 'unif1c4': 494, 'unif391': 495, 'unie3ec': 496, 'unie4b6': 497, 'unief0e': 498, 'unie621': 499, 'unif50d': 500, 'unieebb': 501, 'unif86a': 502, 'unif7f4': 503, 'unif23a': 504, 'unieb0e': 505, 
'unie770': 506, 'unie5b5': 507, 'unif2ac': 508, 'unif679': 509, 'unif030': 510, 'unie5f5': 511, 'unie1fb': 512, 'uniefc8': 513, 'unie5ea': 514, 'unif48f': 515, 'unie3de': 516, 'unie029': 517, 'unie880': 518, 'unif787': 519, 'unif69e': 520, 'unieaa7': 521, 'unif8b2': 522, 'unie596': 523, 'unif633': 524, 'unie906': 525, 'unieed9': 526, 'uniec2f': 527, 'unie568': 528, 'unieb27': 529, 'unie9b1': 530, 'unie69c': 531, 'unif10d': 532, 'unif19b': 533, 'unif0e9': 534, 'unie432': 535, 'uniea57': 536, 'unie457': 537, 'unie931': 538, 'unif8a2': 539, 'unie6c4': 540, 'unied46': 541, 'uniec1f': 542, 'uniebd7': 543, 'unie910': 544, 'uniedde': 545, 'unie3b6': 546, 'uniec91': 547, 'unie2e6': 548, 'unieafa': 549, 'unie7a9': 550, 'uniec86': 551, 'unie9d8': 552, 'unief20': 553, 'unie64b': 554, 'uniea07': 555, 'unie201': 556, 'unie83f': 557, 'unif7bf': 558, 'uniec39': 559, 'unif318': 560, 'unief23': 561, 'unie374': 562, 'unif1db': 563, 'unie827': 564, 'unieeb8': 565, 'unif107': 566, 'unie095': 567, 'unieda5': 568, 'unif478': 569, 'unif8f5': 570, 'unie2be': 571, 'uniedcd': 572, 'unif642': 573, 'unied92': 574, 'unie326': 575, 'uniec12': 576, 'unif8ec': 577, 'uniecb7': 578, 'unif415': 579, 'unie1cf': 580, 'unif620': 581, 'unie8cd': 582, 'unieb4b': 583, 'unie0c2': 584, 'unie694': 585, 'unie0ba': 586, 'unie85b': 587, 'unie1a7': 588, 'uniefdd': 589, 'unif5fb': 590, 'unie296': 591, 'unie2b7': 592, 'uniec52': 593, 'unie00c': 594, 'unif2db': 595, 'unie97b': 596, 'uniec38': 597, 'unif339': 598, 'unif3f9': 599, 'unif8f2': 600, 'unie3bd': 601, 'unie4ba': 602}

# Real character for each glyph index: hanzi_list[final_num[glyph_name]] is the
# character the obfuscated glyph stands for.  The first two slots are blanks
# (they line up with 'glyph00000' and 'x' in final_num).
# NOTE(review): the "|" entries look like placeholders for characters that
# were not identified -- confirm against the font.
hanzi_list=[" "," " ,"1","2","3","4","5","6","7","8","9","0","店","中","美","家","馆","小","车","大","市","公","酒","行","国","品","发","电","金","心","业","商","司","超","生","装","园","场","食","有","新","限","天","面","工","服","海","华","水","房","饰","城","乐","汽","香","部","利","子","老","艺","花","专","东","肉","菜","学","福","饭","人","百","餐","茶","务","通","味","所","山","区","门","药","银","农","龙","停","尚","安","广","鑫","一","容","动","南","具","源","兴","鲜","记","时","机","烤","文","康","信","果","阳","理","锅","宝","达","地","儿","衣","特","产","西","批","坊","州","牛","佳","化","五","米","修","爱","北","养","卖","建","材","三","会","鸡","室","红","站","德","王","光","名","丽","油","院","堂","烧","江","社","合","星","货","型","村","自","科","快","便","日","民","营","和","活","童","明","器","烟","育","宾","精","屋","经","居","庄","石","顺","林","尔","县","手","厅","销","用","好","客","火","雅","盛","体","旅","之","鞋","辣","作","粉","包","楼","校","鱼","平","彩","上","吧","保","永","万","物","教","吃","设","医","正","造","丰","健","点","汤","网","庆","技","斯","洗","料","配","汇","木","缘","加","麻","联","卫","川","泰","色","世","方","寓","风","幼","羊","烫","来","高","厂","兰","阿","贝","皮","全","女","拉","成","云","维","贸","道","术","运","都","口","博","河","瑞","宏","京","际","路","祥","青","镇","厨","培","力","惠","连","马","鸿","钢","训","影","甲","助","窗","布","富","牌","头","四","多","妆","吉","苑","沙","恒","隆","春","干","饼","氏","里","二","管","诚","制","售","嘉","长","轩","杂","副","清","计","黄","讯","太","鸭","号","街","交","与","叉","附","近","层","旁","对","巷","栋","环","省","桥","湖","段","乡","厦","府","铺","内","侧","元","购","前","幢","滨","处","向","座","下","臬","凤","港","开","关","景","泉","塘","放","昌","线","湾","政","步","宁","解","白","田","町","溪","十","八","古","双","胜","本","单","同","九","迎","第","台","玉","锦","底","后","七","斜","期","武","岭","松","角","纪","朝","峰","六","振","珠","局","岗","洲","横","边","济","井","办","汉","代","临","弄","团","外","塔","杨","铁","浦","字","年","岛","陵","原","梅","进","荣","友","虹","央","桂","沿","事","津","凯","莲","丁","秀","柳","集","紫","旗","张","谷","的","是","不","了","很","还","个","也","这","我","就","在","以","可","到","错","没","去","过","感","次","要","比","觉","看","得","说","常","真","们","但","最","喜","哈","么","别","位","能","较","|","境","非","为","欢","然","他","挺","着","价","那","意","种","想","出","员","两","推","做","|"
,"排","实","分","间","甜","度","起","满","给","热","完","格","荐","喝","等","其","再","几","只","现","朋","候","样","直","而","买","于","般","豆","量","选","奶","打","每","评","少","算","又","因","情","找","些","份","置","适","什","蛋","师","气","你","姐","棒","试","总","定","啊","足","级","整","带","虾","如","态","且","尝","主","话","强","当","更","板","知","己","无","酸","让","入","啦","式","|","笑","赞","片","酱","差","像","提","队","走","嫩","才","刚","午","接","重","串","回","晚","微","周","值","费","性","桌","拍","跟","块","调","糕"]


# Glyph name -> glyph index for the digit font; get_real_num() uses it to
# decode prices and review counts.
# NOTE(review): indices go up to 15 but shuzi_list below has only 12 entries,
# so glyphs 12-15 ('unif5f1', 'unie309', 'uniea1b', 'unif3bf') cannot be
# looked up in shuzi_list -- confirm which characters they stand for.
shuzi_num={'glyph00000': 0, 'x': 1, 'unif5db': 2, 'unif1c1': 3, 'unif8c1': 4, 'unie501': 5, 'unied3d': 6, 'unif828': 7, 'unie0d5': 8, 'unie48b': 9, 'unie4a9': 10, 'unief3b': 11, 'unif5f1': 12, 'unie309': 13, 'uniea1b': 14, 'unif3bf': 15}
# Character for each digit-font glyph index; the first two slots are empty
# (they line up with 'glyph00000' and 'x').
shuzi_list=['','','1','2','3','4','5','6','7','8','9','0']

# Debug output: position of '河' in hanzi_list (raises ValueError if absent).
print(hanzi_list.index('河'))

from lxml import    etree
# Random browser User-Agent from fake_useragent.
# NOTE(review): `ua` is computed but the headers below send a fixed
# User-Agent; wire it in only if rotating the UA alongside a fixed login
# cookie is really intended.
user_agent=UserAgent()
ua=user_agent.random
# Browser-like request headers sent with every page request.
# NOTE(review): the Cookie is a hard-coded browser session (it carries values
# such as `dper` and `uamo`); it will expire and should not be committed --
# consider loading it from configuration instead.
headers={
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding":"gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Referer" : "http://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/c8452b46b93efe6c8d71f25cbf3fdcf7.css",
    # The value must be the bare UA string; the original value started with a
    # literal "User-Agent: " prefix, which was sent as part of the header value.
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
    "Cookie": "fspop=student; _lxsdk_cuid=17526568150c8-057f2e4e9b0bd1-3323767-151800-17526568150c8; _lxsdk=17526568150c8-057f2e4e9b0bd1-3323767-151800-17526568150c8; _hc.v=f89b5402-1920-6473-8ea3-fe6140109f50.1602666007; s_ViewType=10; _dp.ac.v=482bf0af-d2c4-4521-aa86-05e03cf05d01; ua=%E5%8C%85%E9%9D%92%E5%A4%A9_8258; ctu=092729311bdbf5b249043bc590d0ca4370c52e82ad59fb7de1fdf41599181d0a; aburl=1; cityInfo=%7B%22cityId%22%3A9%2C%22cityName%22%3A%22%E9%87%8D%E5%BA%86%22%2C%22provinceId%22%3A0%2C%22parentCityId%22%3A0%2C%22cityOrderId%22%3A0%2C%22isActiveCity%22%3Afalse%2C%22cityEnName%22%3A%22chongqing%22%2C%22cityPyName%22%3Anull%2C%22cityAreaCode%22%3Anull%2C%22cityAbbrCode%22%3Anull%2C%22isOverseasCity%22%3Afalse%2C%22isScenery%22%3Afalse%2C%22TuanGouFlag%22%3A0%2C%22cityLevel%22%3A0%2C%22appHotLevel%22%3A0%2C%22gLat%22%3A0%2C%22gLng%22%3A0%2C%22directURL%22%3Anull%2C%22standardEnName%22%3Anull%7D; cy=9; cye=chongqing; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1602766315,1602768257,1602853468,1602896940; dper=d2eae47993f8001491f6ff1594faa3b4fb7b929b9b0cfbfc96138e514da123f235eca63223dcd5dbbc8fb1e7a4d8dca8ccb7fb8dea90f404a9da62da0e4314237178bba721e33b13d28c560db470a60cd3841932e1f836250c4052606f36878d; ll=7fd06e815b796be3df069dec7836c3df; uamo=15978973646; dplet=3685a688f2538b66c64c3f6a68877c19; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1602898122; _lxsdk_s=175341a153c-b70-254-d0f%7C%7C149",
    "Host": "www.dianping.com",
    "Upgrade-Insecure-Requests": "1"

}
# Output file for the scraped records; it is closed at the end of the script.
file=open("火锅.txt","w",encoding='utf-8')

# with open("页面url.txt") as lines:
    # for line in lines:
# Number of recommended dishes grouped under each shop when slicing the flat
# dish list.
# NOTE(review): assumes every shop on the listing page shows exactly this many
# dishes -- verify against the page markup.
DISHES_PER_SHOP = 3

# Listing pages to scrape.  To process every page collected earlier, read the
# URLs from 页面url.txt (one per line, stripped) into this list instead.
PAGE_URLS = ['http://www.dianping.com/chongqing/ch10/g110p3']

# A glyph name is "uni" followed by four hex digits.
GLYPH_NAME_LENGTH = 7


def _lookup_glyph(name, glyph_index, charset):
    """Return the character for glyph *name*, or None when it is not mapped."""
    position = glyph_index.get(name)
    if position is None or position >= len(charset):
        return None
    return charset[position]


def get_real_worlds(add):
    """Decode one obfuscated address and record it in ``shop_add``.

    Args:
        add: List of tokens from one address.  A token is either a glyph name
            (``unie9dd``), a glyph name immediately followed by plain text
            (``unie9dd路12号``), or plain text.

    Returns:
        The decoded address.  It is also printed and appended to the
        module-level ``shop_add`` list, as before.

    Tokens that cannot be decoded are kept unchanged instead of being dropped,
    so unobfuscated parts of an address are no longer lost.
    """
    pieces = []
    for token in add:
        char = _lookup_glyph(token, final_num, hanzi_list)
        if char is not None:
            pieces.append(char)
            continue
        # Glyph name glued to trailing plain text: decode the first 7
        # characters and keep the remainder as it is.
        head, tail = token[:GLYPH_NAME_LENGTH], token[GLYPH_NAME_LENGTH:]
        char = _lookup_glyph(head, final_num, hanzi_list)
        pieces.append(char + tail if char is not None else token)
    hanzi = "".join(pieces)
    print(hanzi)
    shop_add.append(hanzi)
    return hanzi


def get_real_num(i):
    """Decode a comma-separated run of digit glyph entities into plain text.

    Args:
        i: String such as ``",&#xf5db,,&#xf1c1,"``; each token is either an
            ``&#x....`` entity (without the trailing ``;``) or plain text.

    Returns:
        The decoded string.  Tokens without a usable mapping -- including
        glyph indices beyond the end of ``shuzi_list`` -- are kept as they are
        rather than raising ``IndexError``.
    """
    pieces = []
    for token in i.split(","):
        token = token.replace('&#x', 'uni')
        position = shuzi_num.get(token)
        if position is not None and position < len(shuzi_list):
            pieces.append(str(shuzi_list[position]))
        else:
            pieces.append(token)
    return "".join(pieces)


def _decode_shop_num(fragment):
    """Decode the HTML fragment of a price or review count into plain text."""
    fragment = fragment.replace('<svgmtsi class="shopNum">', ',').replace(';</svgmtsi>', ',')
    return get_real_num(fragment)


try:
    for url in PAGE_URLS:
        food_list = []
        content_list = []
        price_list = []
        shop_name_list = []
        shop_add = []

        res = requests.get(url=url, headers=headers)
        time.sleep(1)  # throttle so consecutive pages are not requested back to back
        page = res.text

        price = re.findall('<b>¥(.*?)</b>', page)   # average price per person
        all_b = re.findall('<b>(.*?)</b>', page)    # <b> tags hold price, review count and dish data
        food = re.findall('data-click-name="shop_tag_dish_click"(.*?)</a>', page)  # recommended dishes
        shop_name = re.findall(' <img title(.*?)data-src', page)  # shop thumbnails carry the name in alt=""
        addrs = re.findall('class="addr">(.*?)</span>', page)

        # Shop names: keep an empty name when alt="" is missing so the lists
        # stay aligned by position.
        for fragment in shop_name:
            found = re.search('alt="(.*?)"', fragment)
            shop_name_list.append(found.group(1) if found else "")

        # Recommended dishes: the dish name is the text after the tag's
        # closing '>'.
        for fragment in food:
            _, sep, text = fragment.partition('>')
            food_list.append(text.replace('>', '') if sep else fragment)

        # Every fifth <b> tag is taken as the review count.
        # NOTE(review): relies on each shop contributing exactly five <b> tags.
        content = all_b[::5]

        # Review counts
        for fragment in content:
            content_list.append(_decode_shop_num(fragment))
        # Prices
        for fragment in price:
            price_list.append(_decode_shop_num(fragment))
        # Addresses: turn the markup into a comma-separated list of glyph
        # names and plain text, then decode it.
        for add in addrs:
            add = add.replace('<svgmtsi class="address">', ',').replace('</svgmtsi>', '').replace('(','').replace(')','').replace(";","")
            add = add.replace("&#x", "uni")
            get_real_worlds(add.split(","))

        print(content_list)
        print(food_list)
        print(price_list)
        print(shop_name_list)
        print(shop_add)

        if not shop_name_list:
            # Typically means the request was blocked or the markup changed.
            print("no shops parsed from", url, "status", res.status_code)

        # zip() stops at the shortest list, so a short or partially parsed
        # page no longer raises IndexError.
        records = zip(shop_name_list, price_list, content_list, shop_add)
        for index, (name, cost, review, address) in enumerate(records):
            start = index * DISHES_PER_SHOP
            dishes = "".join(dish + "," for dish in food_list[start:start + DISHES_PER_SHOP])

            file.write("名字:" + name)
            file.write("  价格:" + cost)
            file.write("  评价:" + review)
            file.write("  推荐菜: " + dishes)
            file.write("  地址:" + address)
            file.write("\n")
finally:
    # Close the output file even when a request or a parse step fails.
    file.close()

在这里插入图片描述
在这里插入图片描述

最后把这 15 条示例数据保存到了一个文件里,成功解决了字体反爬。
有兴趣的同学买个代理 ip,就可以爬取全站了。

  • 3
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值