# Scraping a celebrity's Weibo posts with Selenium

# coding--utf8
import selenium
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
import os
import csv
import sys
# Raise the interpreter's recursion limit to 100000.
# NOTE(review): nothing in the visible part of this script recurses deeply;
# confirm this is still needed before keeping it.
sys.setrecursionlimit(100000)

from pymongo import MongoClient
# Address of the MongoDB server the scraped posts are written to.
host = 'localhost'
port = 27017

client = MongoClient(host=host, port=port)
# Database name.
dlrb = client.get_database('dlrb')
# Collection that receives one document per scraped post.
sheet_tab = dlrb.get_collection('sheet_tab')

# Scrape all of Dilraba's Weibo posts and store them in MongoDB: publish time,
# content, repost count, comment count, like count and link (25 pages in total).

# Shared browser session used by every function below.
# The path is a raw string so the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences.
# NOTE(review): passing the chromedriver path positionally is the Selenium 3
# calling convention -- confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")

def q(page_num):
    """Return the feed URL of user 1669879400 for the given page number.

    ``page_num`` is formatted into the ``page=`` query parameter as-is.
    """
    url_template = (
        'https://weibo.com/u/1669879400'
        '?is_search=0&visible=0&is_all=1&is_tag=0&profile_ftype=1'
        '&page={}#feedtop'
    )
    return url_template.format(page_num)


def scroll_pages():
    """Send the END key to the page's <html> element 50 times.

    A one-second pause follows every key press.
    NOTE(review): presumably this gives lazily loaded posts time to render
    before scraping -- confirm against the page's behaviour.
    """
    page_root = driver.find_element_by_tag_name('html')
    presses_left = 50
    while presses_left > 0:
        page_root.send_keys(Keys.END)
        time.sleep(1)
        presses_left -= 1

def get_page_contents():
    """Collect one row per post from the page currently loaded in ``driver``.

    Post details are read from every ``div.WB_detail`` element and the
    counters from every ``div.WB_feed_handle`` element; the two sequences are
    paired up by position.

    Returns:
        A list of ``[push_time, content, link, transfer, comment, like]``
        lists. All values are the strings read from the page; ``link`` is the
        ``href`` attribute of the timestamp anchor.

    Raises:
        IndexError: if the page has fewer ``div.WB_feed_handle`` elements
            than ``div.WB_detail`` elements, so some post has no counters.
    """
    cards = driver.find_elements_by_css_selector('div.WB_detail')
    handles = driver.find_elements_by_css_selector('div.WB_feed_handle')

    post_rows = []
    for card in cards:
        # The first anchor of the "from" line supplies both the publish time
        # (its text) and the link (its href), so it is looked up only once.
        anchor = card.find_element_by_css_selector('div.WB_from.S_txt2 > a:nth-child(1)')
        push_time = anchor.text
        content = card.find_element_by_css_selector('div.WB_text.W_f14').text
        link = anchor.get_attribute('href')
        post_rows.append([push_time, content, link])

    # List items 2, 3 and 4 of the handle bar are read as transfer, comment
    # and like, in that order.
    counter_selector = 'div > ul > li:nth-child({}) > a > span > span > span > em:nth-child(2)'
    counter_rows = [
        [
            handle.find_element_by_css_selector(counter_selector.format(position)).text
            for position in (2, 3, 4)
        ]
        for handle in handles
    ]

    if len(counter_rows) < len(post_rows):
        raise IndexError(
            'found {} posts but only {} counter bars on the page'.format(
                len(post_rows), len(counter_rows)
            )
        )

    # Surplus counter rows (if any) are ignored, as zip stops at the last post.
    return [post + counters for post, counters in zip(post_rows, counter_rows)]

# info_list has the shape [[push_time, content, link, transfer, comment, like], ...];
# every row is converted to a dict before it is inserted into MongoDB.
def save(info_list):
    """Insert each row of ``info_list`` into ``sheet_tab`` as one document.

    The first six items of a row are stored under the keys ``push_time``,
    ``content``, ``link``, ``transfer``, ``comment`` and ``like``.
    """
    field_names = ('push_time', 'content', 'link', 'transfer', 'comment', 'like')
    for row in info_list:
        document = {name: row[position] for position, name in enumerate(field_names)}
        sheet_tab.insert_one(document)

 for i in range(1,26):
    driver.get(q(i))
    input()
    scroll_pages()
    time.sleep(1)
    info_list = get_page_contents()
    save(info_list)
    print(info_list)
    print(len(info_list))
    time.sleep(10)


# Note: data exported from MongoDB may appear garbled; open the exported CSV in Notepad and save it again with ANSI encoding to fix this.
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值