# Scrape profile information for internet-celebrity live streamers (爬取网红主播信息)

# -*- coding: utf-8 -*-
"""
Created on Tue Nov  2 10:21:05 2021

@author: yiting.liu
"""


import requests
from bs4 import BeautifulSoup
# Step 1: read the ranking landing page and collect one link per platform.

# Base URL of the site; the hrefs scraped below are appended to it.
site = 'http://www.dwanghong.com'
# Ranking landing page; its "nav-tags" block is read for the platform links.
url = 'http://www.dwanghong.com/rank/'

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it show
# up later as an AttributeError on markup that an error page does not contain.
response = requests.get(url, timeout=10)
response.raise_for_status()
r = response.content.decode('utf-8')

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed, emits a warning, and may build a different tree per machine.
soup = BeautifulSoup(r, 'html.parser')

nav_tags = soup.find('div', class_='nav-tags')
if nav_tags is None:
    raise RuntimeError('nav-tags block not found on %s' % url)
result = nav_tags.find_all('li')

# platform_list holds [platform_name, platform_href] pairs.
platform_list = []
for item in result:
    # Look the anchor up once and reuse it for both the text and the href.
    link = item.find('a')
    if link is None or not link.get('href'):
        # An <li> without a usable link cannot be followed; skip it.
        continue
    platform_list.append([link.text, link['href']])

# Step 2: visit every platform ranking page and collect its streamer entries.
# Each row of wanghong_list_all is
# [platform_name, rank_text, streamer_name, streamer_href].
wanghong_list_all = []
for platform_name, platform_href in platform_list:
    print('----------正在浏览 %s ----------' % platform_name)
    site_href = site + platform_href

    # Bound the wait and check the HTTP status; a platform page that cannot be
    # fetched is skipped so the remaining platforms are still crawled.
    try:
        response = requests.get(site_href, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('----------跳过 %s:请求失败 (%s)----------' % (site_href, exc))
        continue

    # Explicit parser so the result does not depend on which optional parser
    # libraries are installed.
    soup_href = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')
    content_list = soup_href.find('div', class_='content-list')
    if content_list is None:
        print('----------跳过 %s:未找到 content-list----------' % site_href)
        continue

    for entry in content_list.find_all('li'):
        link = entry.find('a')
        rank = entry.find('em')
        if link is None or rank is None:
            # Entry lacks the anchor or the rank element; nothing to record.
            continue
        wanghong_href = link['href']
        wanghong_name = link['title']
        wanghong_top = rank.text
        # Append in place instead of rebuilding the whole list with `+` for
        # every platform.
        wanghong_list_all.append(
            [platform_name, wanghong_top, wanghong_name, wanghong_href])
        print('----------正在浏览 %s 下,%s - %s----------'
              % (platform_name, wanghong_top, wanghong_name))

# The original bare `len(...)` expression only displays a value in an
# interactive session; print it so the count is also visible when run as a script.
print(len(wanghong_list_all))


# Step 3: open every streamer page and read the rows of its "flower-info" block.

# Output labels for the <dl> rows, in the order the rows are read.
# NOTE(review): the mapping is purely positional (row 0 -> first label, ...),
# exactly as in the original script — confirm it against the live page layout.
field_names = [
    '中文名',
    '别名',
    '出生地点',
    '出生日期',
    '身高',
    '直播平台',
    '直播类型',
    '直播间id',
    '直播时间',
    '粉丝数量',
    '新浪微博',
    '百度贴吧',
    '更新时间',
]

# One dict per streamer; keys are the labels above plus '平台', 'TOP排名', 'href'.
wanghong_info_list = []
for platform, rank, name, href in wanghong_list_all:
    print('----------正在浏览 %s %s 主播 %s ----------' % (platform, rank, name))
    site_wanghong_href = site + href

    # This loop issues one request per streamer, so a single timeout or HTTP
    # error must not throw away everything collected so far: report and skip.
    try:
        response = requests.get(site_wanghong_href, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('---------- 跳过 %s:请求失败 (%s) ----------'
              % (site_wanghong_href, exc))
        continue

    # Explicit parser so the result does not depend on which optional parser
    # libraries are installed.
    soup_wanghong_href = BeautifulSoup(
        response.content.decode('utf-8'), 'html.parser')
    flower_info = soup_wanghong_href.find('div', class_='flower-info')
    if flower_info is None:
        print('---------- 跳过 %s:未找到 flower-info ----------'
              % site_wanghong_href)
        continue
    result_wanghong_href = flower_info.find_all('dl')

    info = {}
    for index, field in enumerate(field_names):
        # A page with fewer rows, or a row without <dd>, yields an empty value
        # for that field instead of an IndexError/AttributeError.
        if index < len(result_wanghong_href):
            dd = result_wanghong_href[index].find('dd')
        else:
            dd = None
        info[field] = dd.text if dd is not None else ''
    # Context carried over from the ranking page; added last so the key order
    # matches the original dict.
    info['平台'] = platform
    info['TOP排名'] = rank
    info['href'] = site_wanghong_href

    wanghong_info_list.append(info)
    print('---------- 完成浏览 %s 平台下,%s-%s ----------' % (platform, rank, name))

# Step 4: write every collected profile to a CSV file named after today's date.
from datetime import datetime

import pandas as pd

# Formatting the datetime through a format spec produces the same
# YYYY-MM-DD text as calling strftime('%Y-%m-%d') on it.
today = '{:%Y-%m-%d}'.format(datetime.now())

# One row per dict in wanghong_info_list.
df_flower = pd.DataFrame(wanghong_info_list)
output_name = '网红数据_{}.csv'.format(today)
df_flower.to_csv(output_name)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值