仅供学习,别搞其他事
前几天一觉醒来,发现群里有这么一条消息:
感觉这个还不简单?干就完了
打开网站,抓个登录的包(不会写模拟登录,就是菜),发现店名还是很好抓的
但是电话得点到具体店家链接才能看到
这思路很清晰了呀:先把店家和对应的链接抓下来整理成一个表,再逐个访问具体店家的链接去抓它的电话
但是我还是大意了
这电话明显是加了反爬了
呵呵,不会了
百度一下,原来是 CSS 字体加密,可以搞呀。参考了一波这篇文章:《爬虫实例—爬取大众点评信息》(Migrant workers,CSDN 博客)
没写到数据库里面(因为不会),直接打印出来的
后面断断续续搞了两天,李奶奶的太菜了(直接抓包放的)
# 作者:juju
# 日期:2021/9/17 15:12
import requests
from bs4 import BeautifulSoup
from d import get_phone_num
def get_i(i):
    """Print name, detail link and phone number for the shops on one result page.

    Requests page ``i`` of the hard-coded Dianping keyword search, looks up
    the element with id ``shop-all-list`` and, for every ``<li>`` inside it,
    prints the ``alt`` text of its first ``<img>``, the ``href`` of its first
    ``<a>`` and the value returned by ``get_phone_num`` for that link.

    Args:
        i: Page number appended to the search URL.

    Returns:
        None. Results are printed to stdout.
    """
    # NOTE(review): these are session cookies captured from a logged-in
    # browser; they are account-specific and presumably expire -- confirm and
    # consider loading them from configuration instead of source code.
    cookies = {
        'fspop': 'test',
        'cy': '23',
        'cye': 'haikou',
        '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
        '_lxsdk_cuid': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        '_lxsdk_s': '17bf28ccf06-635-7d1-3c5%7C%7C116',
        '_lxsdk': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1631861920',
        'Hm_lpvt_602b80cf8079ae6591966cc70a3940e7': '1631862378',
        '_hc.v': '607f25f8-ad7c-38b7-4a73-79eca8f1af81.1631861921',
        's_ViewType': '10',
        '_dp.ac.v': '54123a93-bcb5-4efe-bc74-d7dfb6d2e261',
        'thirdtoken': '880720a3-5dd2-4ed2-b111-4cb3f1b35c8f',
        '_thirdu.c': '8f9673ecb6879d3d8c297364f1afe74d',
        'dplet': '4b0a0fecb04ab08f834e3afc03343b98',
        'dper': 'ca0efb15c2f15e7bbd955326e9340a7212e533b24c7f96f9729e8d7c3ad96af72dd5f47a846fdb0d65411f810dfb3306e400aefa2d21043b7b416480df5bf3dd0db110c97897e340595b05a36061591c0257998c8da9dd0003fac97840efd5ce',
        'll': '7fd06e815b796be3df069dec7836c3df',
        'ua': '%E5%BF%A0%E5%8E%9A%E8%80%81%E5%AE%9E%E7%9A%84%E7%AB%A5%E5%93%A5',
        'ctu': '9689bea245262631f40d14a2e74522f9549075c19e859184c077cdae34473da0',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    # The keyword segment is a percent-encoded search term; the page number
    # is appended after the trailing "p".
    url_shop_list = 'http://www.dianping.com/search/keyword/23/0_%E5%89%A7%E6%9C%AC%E6%9D%80/p' + str(i)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url_shop_list, headers=headers, cookies=cookies, timeout=10)
    response_r = BeautifulSoup(response.text, 'lxml')

    # Container holding the shop list.
    res = response_r.find(id="shop-all-list")
    if res is None:
        # The served page has no shop list (possibly a verification page or
        # expired cookies); report it instead of crashing on None.
        print('page %s: element "shop-all-list" not found, skipped' % i)
        return

    # Only <li> elements inside the container are shop entries;
    # find_all_next() would also return every later <li> in the document.
    for shop in res.find_all('li'):
        shop_img = shop.find('img')  # its alt text carries the shop name
        shop_a = shop.find('a')      # its href is the shop detail link
        if shop_img is None or shop_a is None:
            # Not a shop entry (e.g. a nested list item); nothing to print.
            continue
        shop_url = shop_a.get('href')
        # Print shop name, link and phone number.
        print('店家:%s;链接:%s;电话号码:%s' % (shop_img.get('alt'), shop_url, get_phone_num(url_shop_list, shop_url)))
# Script entry point: crawl result pages 2 and 3 when run directly.
if __name__ == '__main__':
    for page_no in range(2, 4):
        get_i(page_no)
# 作者:juju
# 日期:2021/9/17 17:22
import requests
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont
def get_phone_num(refer, shop_url):
    """Fetch a shop detail page and decode its font-obfuscated phone number.

    The page is downloaded with ``refer`` sent as the ``Referer`` header. The
    second stylesheet link of the page is fetched, the web-font URL preceding
    the ``.num`` rule is extracted from it, and the font's glyph order is used
    to map each obfuscated token in the ``<p class="expand-info tel">``
    element back to a digit.

    Args:
        refer: URL sent as the ``Referer`` header (the search-result page).
        shop_url: URL of the shop detail page.

    Returns:
        The decoded phone number as a string. It always starts with ``'1'``
        and the last character of the assembled text is dropped.

    Raises:
        IndexError: If the page has fewer than two stylesheet links or the
            font has fewer than twelve glyphs.
        AttributeError: If the page has no ``<p class="expand-info tel">``.
        KeyError: If a token ending in ``';'`` is not in the glyph mapping.
    """
    # Local import so this block does not depend on the file's import section.
    from io import BytesIO

    # NOTE(review): session cookies captured from a logged-in browser; they
    # are account-specific and presumably expire -- confirm and consider
    # loading them from configuration instead of source code.
    cookies = {
        'fspop': 'test',
        'cy': '23',
        'cye': 'haikou',
        '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
        '_lxsdk_cuid': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        '_lxsdk': '17bf28ccf05c8-0b28b6685b9a86-1b337040-144000-17bf28ccf054d',
        'Hm_lvt_602b80cf8079ae6591966cc70a3940e7': '1631861920',
        'Hm_lpvt_602b80cf8079ae6591966cc70a3940e7': '1631870247',
        '_hc.v': '607f25f8-ad7c-38b7-4a73-79eca8f1af81.1631861921',
        's_ViewType': '10',
        '_dp.ac.v': '54123a93-bcb5-4efe-bc74-d7dfb6d2e261',
        'thirdtoken': '880720a3-5dd2-4ed2-b111-4cb3f1b35c8f',
        '_thirdu.c': '8f9673ecb6879d3d8c297364f1afe74d',
        'dplet': '4b0a0fecb04ab08f834e3afc03343b98',
        'dper': 'ca0efb15c2f15e7bbd955326e9340a7212e533b24c7f96f9729e8d7c3ad96af72dd5f47a846fdb0d65411f810dfb3306e400aefa2d21043b7b416480df5bf3dd0db110c97897e340595b05a36061591c0257998c8da9dd0003fac97840efd5ce',
        'll': '7fd06e815b796be3df069dec7836c3df',
        'ua': '%E5%BF%A0%E5%8E%9A%E8%80%81%E5%AE%9E%E7%9A%84%E7%AB%A5%E5%93%A5',
        'ctu': '9689bea245262631f40d14a2e74522f9549075c19e859184c077cdae34473da0',
        '_lxsdk_s': '17bf2f8e96d-b81-d29-f00%7C%7C126',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Referer': refer,
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }
    # Timeouts keep one stalled connection from hanging the whole crawl.
    response = requests.get(shop_url, headers=headers, cookies=cookies, timeout=10)
    # Removing the '&#x' prefix turns every hexadecimal character reference
    # '&#xXXXX;' into the literal text 'XXXX;', so the parser keeps the code
    # points as readable tokens instead of resolving them to characters.
    page = BeautifulSoup(response.text.replace('&#x', ''), 'lxml')

    # Stylesheet that declares the anti-crawler fonts: second stylesheet
    # link on the page, with the leading '//' of its href cut off.
    font_css = page.find_all('link', rel="stylesheet")[1]['href'][2:]
    css_text = requests.get('http://' + font_css, timeout=10)

    # Take the CSS text before the '.num' rule and keep what follows its
    # last 'url("//'; the final five characters are trimmed (presumably the
    # closing '");' and the rule's closing brace -- TODO confirm).
    font_url = css_text.text.split('.num')
    font_woff = font_url[0].split('url("//')[-1][:-5]

    # Download the font file.
    headers_woff = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    url_woff = 'http://' + font_woff
    res_woff = requests.get(url_woff, headers=headers_woff, timeout=10)

    # Load the font straight from memory; no machine-specific scratch file.
    font = TTFont(BytesIO(res_woff.content))

    # Build the token -> digit mapping. The first two glyphs are skipped and
    # the next ten are taken to be '1'..'9','0' in that order. Dropping the
    # first three characters of a glyph name (presumably a 'uni' prefix --
    # TODO confirm) and appending ';' yields the token form seen in the HTML.
    glyph_names = font.getGlyphOrder()[2:]
    digits = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
    glyph_to_digit = {
        glyph_names[idx][3:] + ';': digit
        for idx, digit in enumerate(digits)
    }

    # Children of the phone paragraph, skipping its first three nodes.
    tel = page.find('p', class_='expand-info tel').contents[3:]

    # The number is assumed to start with '1' (the original author's guess);
    # obfuscated tokens end with ';', anything else is copied verbatim.
    parts = ['1']
    for node in tel:
        text = node.string
        if text is None:
            # Tag without a single string child: nothing to decode.
            continue
        if text.endswith(';'):
            parts.append(glyph_to_digit[text])
        else:
            parts.append(text)
    # Drop the last character of the assembled text, as the original did
    # (presumably trailing whitespace from the final node -- TODO confirm).
    return ''.join(parts)[:-1]
这个warning还真没搞定,有没有大佬带带我呀
爬虫还是很有意思滴