# -*- coding: utf-8 -*-
import selenium
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
import os
import csv
import sys
sys.setrecursionlimit(100000)
from pymongo import MongoClient
# MongoDB connection settings.
host = 'localhost'
port = 27017
client = MongoClient(host, port)
dlrb = client['dlrb']  # database name
sheet_tab = dlrb['sheet_tab']  # target collection for scraped posts
# Scrape all of Dilraba's Weibo posts (25 pages in total) and store them in
# MongoDB: publish time, content, repost count, comment count, like count, link.
# Fix: use a raw string for the Windows path — the original non-raw string
# contained invalid escape sequences (e.g. "\P", "\G"), which are deprecated
# in Python 3 and will become errors in future versions.
driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
def q(page_num):
    """Build the profile-feed URL for the given page number (1-based)."""
    base = ('https://weibo.com/u/1669879400?is_search=0&visible=0'
            '&is_all=1&is_tag=0&profile_ftype=1&page={}#feedtop')
    return base.format(page_num)
def scroll_pages():
    """Scroll the page to the bottom repeatedly so lazy-loaded posts render.

    Sends the END key to the page 50 times, pausing one second between
    presses to give the feed time to load more content.
    """
    page_body = driver.find_element_by_tag_name('html')
    for _ in range(50):
        page_body.send_keys(Keys.END)
        time.sleep(1)
def get_page_contents():
    """Scrape every visible post on the currently loaded profile page.

    Returns a list with one record per post, each of the form
    [push_time, content, link, transfer, comment, like]
    (all values are strings taken from the rendered DOM).
    """
    # One "card" (post body) and one "handle" (action bar) per post.
    cards = driver.find_elements_by_css_selector('div.WB_detail')
    handles = driver.find_elements_by_css_selector('div.WB_feed_handle')
    info_list = []
    # Pair each post body with its action bar directly via zip instead of
    # building two parallel lists and merging them by index afterwards.
    for card, handle in zip(cards, handles):
        # The first anchor of the "from" line carries both the timestamp text
        # and the post permalink, so look it up only once.
        from_link = card.find_element_by_css_selector('div.WB_from.S_txt2 > a:nth-child(1)')
        push_time = from_link.text
        link = from_link.get_attribute('href')
        content = card.find_element_by_css_selector('div.WB_text.W_f14').text
        transfer = handle.find_element_by_css_selector('div > ul > li:nth-child(2) > a > span > span > span > em:nth-child(2)').text
        comment = handle.find_element_by_css_selector('div > ul > li:nth-child(3) > a > span > span > span > em:nth-child(2)').text
        like = handle.find_element_by_css_selector('div > ul > li:nth-child(4) > a > span > span > span > em:nth-child(2)').text
        info_list.append([push_time, content, link, transfer, comment, like])
    return info_list
#info_list format: [[push_time, content, link, transfer, comment, like], [], ...]; MongoDB stores documents (dicts), so each record must be converted.
def save(info_list):
    """Insert each scraped record into the `sheet_tab` MongoDB collection.

    Each element of info_list is a 6-item list in the fixed order
    push_time, content, link, transfer, comment, like.
    """
    fields = ('push_time', 'content', 'link', 'transfer', 'comment', 'like')
    for item in info_list:
        sheet_tab.insert_one(dict(zip(fields, item)))
# Crawl profile pages 1-25: load each page, scroll it fully, scrape the
# posts, and persist them to MongoDB.
for i in range(1,26):
    driver.get(q(i))
    # Manual pause: press Enter in the console once the page (and any login
    # prompt) is ready — presumably for a first-run login; confirm with author.
    input()
    scroll_pages()
    time.sleep(1)
    info_list = get_page_contents()
    save(info_list)
    print(info_list)
    print(len(info_list))
    # Wait between pages — presumably to avoid anti-crawler throttling.
    time.sleep(10)
# Note: data exported from MongoDB to CSV may show garbled characters; open the CSV in Notepad and re-save it with ANSI encoding to fix this.
# Scraping a celebrity's Weibo with Selenium
# (blog footer residue: "latest recommended article published 2023-05-11 13:59:21")