Scraping Dianping restaurant listings with requests, parsel, and a tunnel proxy

# Scraper
import requests
import parsel
url="https://www.dianping.com/beijing/ch10"
headers={
"Cookie":"_lxsdk_cuid=190a0abb540c8-05e1efd44f241a-4c657b58-1bcab9-190a0abb540c8; _lxsdk=190a0abb540c8-05e1efd44f241a-4c657b58-1bcab9-190a0abb540c8; _hc.v=a8046bef-107d-5ab2-e95e-9e03e726fd2a.1720682528; WEBDFPID=2809uyywvy195u5x06v42yy0uu5377zu809z9zyywu497958xwyy2xv5-2036042529390-1720682526622QCASWKCfd79fef3d01d5e9aadc18ccd4d0c95073730; dplet=4dc9aba815e960d57a29e78733d672d8; dper=02022b207eb9435fc51850b8cb81020fcf4b9746b877f6ba48e7b4b117ae9971047b6fecaa2818a99ce1b574d551e86e8d36ab7701db098eb9690000000050210000704d4f3a275190dc55226fcc343108e72d15d4618b138f8437c696776c6e91fdc28ec9086f477148bd3a728bf667941d; ll=7fd06e815b796be3df069dec7836c3df; ua=%E6%99%BA%E7%9D%BF%E6%B8%85%E9%9B%85%E7%9A%84%E5%B0%8F%E8%AE%B8; ctu=69d1b95ddc93c12c6fd5ca1c2e546bb64266c137716cd322f81f48da559e1b5e; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1720682959; HMACCOUNT=926D514439DF34EF; qruuid=08ed07e8-53f0-4c20-8ac4-1517fddba931; s_ViewType=10; fspop=test; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; cy=2; cye=beijing; _lxsdk_s=190a0f5b997-7e2-49c-414%7C%7C212; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1720690331",
"Host": "www.dianping.com",
"Referer":"https://www.dianping.com/",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
}
response = requests.get(url=url, headers=headers)

selector = parsel.Selector(response.text)
href = selector.css(".shop-list ul li .pic a::attr(href)").getall()

for index in href:
    # request each detail page and extract its data
    html_data = requests.get(url=index, headers=headers).text
    # print(html_data)  # uncomment to inspect the raw HTML
    selector_eat = parsel.Selector(html_data)
    # .get(default="") returns "" instead of None when a selector misses,
    # which keeps the string concatenation below from raising TypeError
    title = selector_eat.css(".shop-name::text").get(default="")
    information1 = selector_eat.css(".breadcrumb a:nth-child(1)::text").get(default="")
    information2 = selector_eat.css(".breadcrumb a:nth-child(2)::text").get(default="")
    information3 = selector_eat.css(".breadcrumb a:nth-child(3)::text").get(default="")
    information4 = selector_eat.css(".breadcrumb a:nth-child(4)::text").get(default="")
    score = selector_eat.css(".star-wrapper::text").get(default="")
    commentsum = selector_eat.css("#reviewCount::text").get(default="")
    avgprice = selector_eat.css("#avgPriceTitle::text").get(default="")
    commentscore_eat = selector_eat.css("#comment_score span:nth-child(1)::text").get(default="")
    commentscore_env = selector_eat.css("#comment_score span:nth-child(2)::text").get(default="")
    commentscore_ser = selector_eat.css("#comment_score span:nth-child(3)::text").get(default="")
    print("Shop name: " + title)
    print("Province: " + information1)
    print("Cuisine: " + information2)
    print("District: " + information3)
    print("Business area: " + information4)
    print("Rating: " + score)
    print("Review count: " + commentsum)
    print("Average price: " + avgprice)
    print("Taste score: " + commentscore_eat)
    print("Environment score: " + commentscore_env)
    print("Service score: " + commentscore_ser)
    print("")
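If you want the scraped fields on disk instead of only printed, a minimal sketch using Python's csv module could be appended inside the loop (the filename dianping.csv and the column order are assumptions, not part of the original script):

import csv  # put this with the other imports at the top of the file

# Hedged sketch: append one row per shop; utf-8-sig keeps Excel happy with Chinese text
with open("dianping.csv", "a", newline="", encoding="utf-8-sig") as f:
    csv.writer(f).writerow([title, information1, information2, information3,
                            information4, score, commentsum, avgprice,
                            commentscore_eat, commentscore_env, commentscore_ser])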

Sample code from the tunnel IP proxy provider

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests

# Tunnel domain:port
tunnel = "k925.kdltpspro.com:15818"

# Username/password authentication
username = "t12070331644594"
password = "quc296wx"
proxies = {
    "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
    "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
}

# IP whitelist mode (the whitelist must be configured in advance)
# proxies = {
#     "http": "http://%(proxy)s/" % {"proxy": tunnel},
#     "https": "http://%(proxy)s/" % {"proxy": tunnel}
# }

# Target page to request
target_url = "https://dev.kdlapi.com/testproxy"

# Send the request through the tunnel domain
response = requests.get(target_url, proxies=proxies)

# Print the page content
if response.status_code == 200:
    print(response.text)  # do not reuse connections via keep-alive, or the tunnel cannot rotate IPs
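requests keeps connections alive through its connection pool by default, so one way to honor that warning is to send a Connection: close header on every request. A minimal sketch, reusing the target_url and proxies defined above:

# Hedged sketch: close the connection after each response so the
# tunnel can hand out a fresh exit IP on the next request
response = requests.get(target_url,
                        headers={"Connection": "close"},
                        proxies=proxies)
print(response.text)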

Below is the final code combining the two.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import random
import time

import parsel
import requests

# Tunnel domain:port
tunnel = "k925.kdltpspro.com:15818"

# Username/password authentication
username = "t12070331644594"
password = "quc296wx"
proxies = {
    "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
    "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
}

# IP whitelist mode (the whitelist must be configured in advance)
# proxies = {
#     "http": "http://%(proxy)s/" % {"proxy": tunnel},
#     "https": "http://%(proxy)s/" % {"proxy": tunnel}
# }

# Target page to request
target_url = "https://www.dianping.com/beijing/ch10"
headers = {
'cookie':'_lxsdk_cuid=190a0abb540c8-05e1efd44f241a-4c657b58-1bcab9-190a0abb540c8; _lxsdk=190a0abb540c8-05e1efd44f241a-4c657b58-1bcab9-190a0abb540c8; _hc.v=a8046bef-107d-5ab2-e95e-9e03e726fd2a.1720682528; WEBDFPID=2809uyywvy195u5x06v42yy0uu5377zu809z9zyywu497958xwyy2xv5-2036042529390-1720682526622QCASWKCfd79fef3d01d5e9aadc18ccd4d0c95073730; dplet=4dc9aba815e960d57a29e78733d672d8; dper=02022b207eb9435fc51850b8cb81020fcf4b9746b877f6ba48e7b4b117ae9971047b6fecaa2818a99ce1b574d551e86e8d36ab7701db098eb9690000000050210000704d4f3a275190dc55226fcc343108e72d15d4618b138f8437c696776c6e91fdc28ec9086f477148bd3a728bf667941d; ua=%E6%99%BA%E7%9D%BF%E6%B8%85%E9%9B%85%E7%9A%84%E5%B0%8F%E8%AE%B8; ctu=69d1b95ddc93c12c6fd5ca1c2e546bb64266c137716cd322f81f48da559e1b5e; qruuid=08ed07e8-53f0-4c20-8ac4-1517fddba931; s_ViewType=10; fspop=test; ll=7fd06e815b796be3df069dec7836c3df; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1720682959,1720694427,1720704375,1720707584; HMACCOUNT=926D514439DF34EF; _lx_utm=utm_source%3Dso.com%26utm_medium%3Dorganic; cy=2; cye=beijing; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1720718347; _lxsdk_s=190a2cdbc21-9d7-1d6-249%7C%7C31',
"Referer":"https://www.dianping.com/",
#"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0"
"User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}
# Send the request through the tunnel domain
response = requests.get(target_url, headers=headers, proxies=proxies)
# Print the page content
if response.status_code == 200:
    print(response.text)  # do not reuse connections via keep-alive, or the tunnel cannot rotate IPs

selector = parsel.Selector(response.text)

href = selector.css(".shop-list ul li .pic a::attr(href)").getall()
print(href)

for index in href:
    # throttle politely between detail-page requests
    time.sleep(random.uniform(1, 2))
    # request each detail page and extract its data
    html_data = requests.get(url=index, proxies=proxies, headers=headers).text
    selector_eat = parsel.Selector(html_data)
    # .get(default="") returns "" instead of None when a selector misses,
    # which keeps the string concatenation below from raising TypeError
    title = selector_eat.css(".shop-name::text").get(default="")
    information1 = selector_eat.css(".breadcrumb a:nth-child(1)::text").get(default="")
    information2 = selector_eat.css(".breadcrumb a:nth-child(2)::text").get(default="")
    information3 = selector_eat.css(".breadcrumb a:nth-child(3)::text").get(default="")
    information4 = selector_eat.css(".breadcrumb a:nth-child(4)::text").get(default="")
    score = selector_eat.css(".star-wrapper::text").get(default="")
    commentsum = selector_eat.css("#reviewCount::text").get(default="")
    avgprice = selector_eat.css("#avgPriceTitle::text").get(default="")
    commentscore_eat = selector_eat.css("#comment_score span:nth-child(1)::text").get(default="")
    commentscore_env = selector_eat.css("#comment_score span:nth-child(2)::text").get(default="")
    commentscore_ser = selector_eat.css("#comment_score span:nth-child(3)::text").get(default="")
    print("Shop name: " + title)
    print("Province: " + information1)
    print("Cuisine: " + information2)
    print("District: " + information3)
    print("Business area: " + information4)
    print("Rating: " + score)
    print("Review count: " + commentsum)
    print("Average price: " + avgprice)
    print("Taste score: " + commentscore_eat)
    print("Environment score: " + commentscore_env)
    print("Service score: " + commentscore_ser)
    print("")
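Tunnel proxies occasionally drop a request while the exit IP rotates, so it can help to wrap the detail-page fetch in a small retry loop. A minimal sketch (the helper name fetch_with_retry, the retry count, and the timeout are assumptions, not part of the provider's API):

# Hedged sketch: retry a GET a few times before giving up
def fetch_with_retry(url, headers, proxies, retries=3, backoff=2.0):
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass  # network hiccup or proxy rotation; try again
        time.sleep(backoff * (attempt + 1))
    return None

Inside the loop, html_data = fetch_with_retry(index, headers, proxies) would then replace the bare requests.get call, with a None check before parsing.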
