要爬取的页面及链接
https://bss.csdn.net/m/topic/blog_star2020
第一种方法(selenium)
爬虫使用的是selenium,插件工具使用的是xpath_helper。
![](https://i-blog.csdnimg.cn/blog_migrate/da7e4d41e18f54fcd4d78d84e77047d3.png)
使用xpath定位元素
完整代码
from selenium import webdriver

# Scrape the CSDN "Blog Star 2020" voting page with Selenium and print the
# standings as HTML table rows (<tr>/<td>), sorted by vote count descending.
driver = webdriver.Chrome()
driver.implicitly_wait(10)  # wait until page elements are loaded before scraping
url = "https://bss.csdn.net/m/topic/blog_star2020"
driver.get(url)

# Locate the per-blogger elements via XPath (Selenium 3 API).
indexs = driver.find_elements_by_xpath('//*[@id="blogList"]/li/a/span')             # voting serial number
names = driver.find_elements_by_xpath('//*[@id="blogList"]/li/a/div[2]')            # blogger name
numbers = driver.find_elements_by_xpath('//*[@id="blogList"]/li/a/div[4]/p[2]/em')  # vote count
urls = driver.find_elements_by_xpath('//*[@id="blogList"]/li/a')                    # voting link

# Collect one record per blogger; the four element lists are parallel.
data = []
for index_el, name_el, number_el, url_el in zip(indexs, names, numbers, urls):
    data.append({
        'index': index_el.text,               # .text extracts the element's text content
        'name': name_el.text,
        'number': int(number_el.text),
        'url': url_el.get_attribute('href'),  # href attribute of the <a> tag
    })

data = sorted(data, key=lambda x: x['number'], reverse=True)  # sort by vote count

# Emit each record as an HTML table row.
# BUG FIX: the original iterated over an undefined name `dataSort`
# (NameError); the sorted list is `data`.
for rank, entry in enumerate(data, start=1):
    print("<tr>")
    print("<td>{}</td>".format(rank))            # rank
    print("<td>{}</td>".format(entry['name']))   # blogger
    print("<td>{}</td>".format(entry['index']))  # voting serial number
    print("<td>{}</td>".format(entry['number'])) # vote count
    print("<td><a href='{}'>{}</a></td>".format(entry['url'], entry['url']))  # voting address
    print("</tr>")

driver.close()  # close the browser window
输出效果(内容较多 只复制前三):
<tr>
<td>1</td>
<td>Hollis在csdn</td>
<td>070</td>
<td>3893</td>
<td><a href='https://bss.csdn.net/m/topic/blog_star2020/detail?username=hollis_chuang'>https://bss.csdn.net/m/topic/blog_star2020/detail?username=hollis_chuang</a></td>
</tr>
<tr>
<td>2</td>
<td>帅地</td>
<td>124</td>
<td>3454</td>
<td><a href='https://bss.csdn.net/m/topic/blog_star2020/detail?username=m0_37907797'>https://bss.csdn.net/m/topic/blog_star2020/detail?username=m0_37907797</a></td>
</tr>
<tr>
<td>3</td>
<td>敖 丙</td>
<td>014</td>
<td>3300</td>
<td><a href='https://bss.csdn.net/m/topic/blog_star2020/detail?username=qq_35190492'>https://bss.csdn.net/m/topic/blog_star2020/detail?username=qq_35190492</a></td>
</tr>
第二种方法(requests)
请求数据
点击查看大图
完整代码
import json

import requests  # if the requests package is missing: pip install requests

# Fetch the CSDN "Blog Star 2020" candidate list from the JSON endpoint and
# build a list of per-blogger dicts sorted by vote count descending.

# Request URL
url = 'https://bss.csdn.net/m/topic/blog_star2020/getUsers'
# Request headers (mobile user agent)
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Mobile Safari/537.36'}
# Form parameters to send
data = {'number': ''}

# BUG FIX: must use keyword arguments — the original positional call
# requests.post(url, headers, data) bound `headers` to the `data` parameter
# (sending the headers dict as the POST body) and `data` to `json`.
response = requests.post(url, headers=headers, data=data)

# BUG FIX: `json` was used without being imported in the original.
data = json.loads(response.text)['data']
data = sorted(data, key=lambda x: x['vote_num'], reverse=True)

dataCsdn = []
for rank, entry in enumerate(data, start=1):
    dataCsdn.append({
        '名次': rank,
        '博主': entry['nick_name'],
        '票数': entry['vote_num'],
        '投票地址': entry['url'],
        '投票序号': entry['number'],
        '博客等级': entry['level'],
        '码龄': entry['codeLevel'],
        '原创': entry['brief'],
    })
""" 解释 """
# post请求
ret = requests.post(url, headers=headers, data=data)  # keyword args — positional would send headers as the body
print(ret) # 输出:<Response [200]>(请求成功,返回200)
print(ret.text)
# 输出(内容太长,复制部分):
'\n{"code":200,"msg":"ok","data":[{"id":"3260","title":"qq_26525215","vote_num":2263,"url":"https:\\/\\/bss.csdn.net\\/m\\/topic\\/blog_star2020\\/detail?username=qq_26525215","img":"","brief":"166","class_id":"95","logs":true,"level":8,"codeLevel":6,"nick_name":"\\u8c19\\u5fc6","avatar":"https:\\/\\/profile.csdnimg.cn\\/F\\/7\\/6\\/1_qq_26525215","article_count":166,"nameWords":null,"number":"001"}]}
# 使用json.loads将数据转回原类型
print(json.loads(ret.text))
# 输出(得票字段vote_num, 博主字段nick_name, 原创文章字段brief,码龄字段codeLevel,投票地址字段url,序号字段number 等等):
{'data': [{'article_count': 166,
'avatar': 'https://profile.csdnimg.cn/F/7/6/1_qq_26525215',
'brief': '166',
'class_id': '95',
'codeLevel': 6,
'id': '3260',
'img': '',
'level': 8,
'logs': True,
'nameWords': None,
'nick_name': '谙忆',
'number': '001',
'title': 'qq_26525215',
'url': 'https://bss.csdn.net/m/topic/blog_star2020/detail?username=qq_26525215',
'vote_num': 2263}]}
内容粘贴
排行榜
更新时间 1-25 投票已截止
进程完成,退出码 0