源码如下:
import requests
import re
from my_fake_useragent import UserAgent
class Gou:
    """Scrape dog listings out of one page of Baixing pet-trade HTML.

    For every ``<li data-aid ...>`` listing element found in the page,
    prints the title, image URL, price and address.
    """

    def __init__(self, html):
        # Raw HTML text of one result page.
        self.html = html

    def __call__(self, *args, **kwargs):
        # Make the instance callable so ``Gou(html)()`` runs the scrape.
        self.main()

    # Main scraping routine.
    def main(self):
        """Extract every listing from ``self.html`` and print its fields."""
        # Each listing lives inside one <li data-aid ...>...</li> element.
        # NOTE(review): no re.S flag, so '.' will not cross newlines —
        # assumes each <li> is served on a single line; confirm for this site.
        res = re.findall(r'<li\sdata-aid.*?>(.*?)</li>', self.html)
        for item in res:
            # Title: taken from the img alt='...' attribute.
            res1 = re.findall(r"'.*?alt='(.*?)'.*?", item)
            # Image URL.
            res2 = re.findall(r".*?data-originSource='(.*?)'", item)
            # Price.
            res3 = re.findall(r".*?<span class='highlight'>(.*?)</span>", item)
            # Address. Guard against a listing with no address div instead of
            # crashing with IndexError on res4[0].
            res4 = re.findall(r".*?<div class='ad-item-detail'>(.*?)</div>.*?", item)
            addr = res4[0] if res4 else ''
            # Print one listing per block, blank line between listings.
            f = f"狗子标题:{res1},\n狗子图片链接:{res2},\n狗子价格:{res3},\n狗子地址:{addr}"
            print(f)
            print()
if __name__ == '__main__':
    # Walk the paginated search results, starting at page 1, until the site
    # stops advertising a "next page" link.
    page = 1
    while True:
        # Announce which page is being fetched.
        print(f"第{page}页")
        # Filtered search URL (category / sex / price range / age range are
        # fixed in the query string; only the page number varies).
        url = f'https://beijing.baixing.com/chongwujiaoyi/m177986/?entities=%E6%80%A7%E5%88%AB_%E5%85%AC&page={page}&%E4%BB%B7%E6%A0%BC%5B0%5D=1000&%E4%BB%B7%E6%A0%BC%5B1%5D=1100&%E5%B9%B4%E9%BE%84%5B0%5D=0&%E5%B9%B4%E9%BE%84%5B1%5D='
        headers = {
            # Session cookie copied from a browser session; likely needed to
            # pass the site's anti-spider check.
            "cookie": "__trackId=161770863848743; __city=beijing; __s=c5hnjhfuv952mpvdhl21ehgn82; Hm_lvt_5a727f1b4acc5725516637e03b07d3d2=1617708683; _ga=GA1.2.8320274.1617708683; _gid=GA1.2.703774208.1617708683; _auth_redirect=https%3A%2F%2Fbeijing.baixing.com%2Fchongwujiaoyi%2Fm177986%2F%3Fentities%3D%25E6%2580%25A7%25E5%2588%25AB_%25E5%2585%25AC%26%25E4%25BB%25B7%25E6%25A0%25BC%255B0%255D%3D1000%26%25E4%25BB%25B7%25E6%25A0%25BC%255B1%255D%3D1100%26%25E5%25B9%25B4%25E9%25BE%2584%255B0%255D%3D0%26%25E5%25B9%25B4%25E9%25BE%2584%255B1%255D%3D; __sense_session_pv=6; Hm_lpvt_5a727f1b4acc5725516637e03b07d3d2=1617708737",
            "if-modified-since": "Tue, 06 Apr 2021 11:32:16 GMT",
            "referer": "https://beijing.baixing.com/oz/s9verify_html?identity=spider_1&redirect=https%3A%2F%2Fbeijing.baixing.com%2Fchongwujiaoyi%2Fm177986%2F%3Fentities%3D%25E6%2580%25A7%25E5%2588%25AB_%25E5%2585%25AC%26%25E4%25BB%25B7%25E6%25A0%25BC%255B0%255D%3D1000%26%25E4%25BB%25B7%25E6%25A0%25BC%255B1%255D%3D1100%26%25E5%25B9%25B4%25E9%25BE%2584%255B0%255D%3D0%26%25E5%25B9%25B4%25E9%25BE%2584%255B1%255D%3D&scene=spider_1",
            "upgrade-insecure-requests": "1",
            # Randomized user agent per request.
            "user-agent": UserAgent().random()
        }
        # Fetch one page; a timeout keeps a stalled connection from hanging
        # this infinite loop forever.
        response = requests.get(url, headers=headers, timeout=30)
        html = response.text
        # Instantiate the scraper and run it (Gou.__call__ dispatches to main()).
        Gou(html)()
        # The "下一页" ("next page") link disappears on the last page.
        if "下一页" not in html:
            break
        page += 1
这里爬取的是阿拉斯加,有意者可以参考源码爬别的狗子