Python 爬取微博一页数据

最新推荐文章于 2024-09-01 16:40:34 发布

YouAreMyQuery

最新推荐文章于 2024-09-01 16:40:34 发布

阅读量327

点赞数

分类专栏： python 文章标签： python 爬虫 html5

本文链接：https://blog.csdn.net/YouAreMyQuery/article/details/120397500

版权

python 专栏收录该内容

11 篇文章

订阅专栏

from selenium import webdriver
import time
import pandas as pd

# Log in to Weibo through a real Chrome window driven by chromedriver.
#
# The path is a raw string: in a normal literal the Windows backslashes form
# invalid escape sequences (\P, \G, \C, \A, \c), which Python warns about and
# will eventually reject. The resulting path value is unchanged.
driver = webdriver.Chrome(executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver')
driver.get("http://weibo.com/login.php")
time.sleep(3)

# Locators of the login form controls.
login_name_xpath = '//*[@id="loginname"]'
password_xpath = '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input'
# NOTE(review): presumably the "log in" link of the form - confirm against the page.
submit_xpath = '//*[@id="pl_login_form"]/div/div[3]/div[6]/a'

# '微博账号' ("Weibo account") and '微博密码' ("Weibo password") are placeholder
# literals; substitute real credentials before running.
driver.find_element_by_xpath(login_name_xpath).clear()
driver.find_element_by_xpath(login_name_xpath).send_keys('微博账号')
driver.find_element_by_xpath(password_xpath).clear()
time.sleep(1)
driver.find_element_by_xpath(password_xpath).send_keys('微博密码')
time.sleep(1)
driver.find_element_by_xpath(submit_xpath).click()

# Leave 20 seconds for the user to scan the QR code with a phone and finish
# logging in.
time.sleep(20)

# One dict per successfully scraped post is appended to this list later on.
content_list = []

# Open the mobile-site search results; the `q` parameter is the URL-encoded
# search phrase "暴雨洪水".
driver.get("https://m.weibo.cn/search?disable_history=1&disable_hot=0&extparam=8008641010000000000_0_0&luicode=10000011&lfid=23103600258008641010000000000&containerid=100103type%3D557%26t%3D10%26q%3D%E6%9A%B4%E9%9B%A8%E6%B4%AA%E6%B0%B4")
time.sleep(7)

# Script that reports the current total height of the page. It is a plain
# expression: the previous `return action=...` form returned the same number
# but also assigned an implicit global named `action` inside the page.
js = "return document.body.scrollHeight"
# The scroll position starts at the top of the page.
height = 0
# Total page height right now.
new_height = driver.execute_script(js)
# Keep scrolling for as long as the page is taller than the position already
# reached.
while height < new_height:
    # Move towards the bottom in 100-pixel steps rather than in one jump.
    for offset in range(height, new_height, 100):
        driver.execute_script('window.scrollTo(0, {})'.format(offset))
        time.sleep(0.2)
    height = new_height
    time.sleep(0.1)
    # Re-measure: the loop runs again only if the page has grown.
    new_height = driver.execute_script(js)

# Visit every post found on the scrolled result page and collect its details
# into `content_list`.
time.sleep(10)
li_lis = driver.find_elements_by_xpath("//*[@class='weibo-text']")
time.sleep(1)
print(len(li_lis))

# Same search results address that is opened before scrolling.
search_url = "https://m.weibo.cn/search?disable_history=1&disable_hot=0&extparam=8008641010000000000_0_0&luicode=10000011&lfid=23103600258008641010000000000&containerid=100103type%3D557%26t%3D10%26q%3D%E6%9A%B4%E9%9B%A8%E6%B4%AA%E6%B0%B4"

# The number of posts counted above bounds the loop; each round reloads the
# result list and looks the elements up again before clicking the iii-th one.
for iii in range(len(li_lis)):
    print(iii)
    driver.get(search_url)
    # Same total waits as before: 15 s before the lookup, 16 s after it.
    time.sleep(15)
    li_list = driver.find_elements_by_xpath("//*[@class='weibo-text']")
    time.sleep(16)

    # Best effort: a failed click (for example, fewer posts after the reload)
    # is reported and the round carries on with whatever page is showing.
    try:
        li_list[iii].click()
    except Exception as e:
        print(e)
    time.sleep(3)

    item = {}
    try:
        item["content"] = driver.find_element_by_class_name("weibo-text").text
        time.sleep(1)

        # Every image address is followed by ';', including the last one.
        image_items = driver.find_elements_by_xpath("//*[@class='m-auto-list']/li")
        item["imgurl"] = "".join(
            li.find_element_by_tag_name("img").get_attribute("src") + ";"
            for li in image_items
        )
        time.sleep(1)

        # Keep the first candidate span whose text contains "郑". When there
        # is none, the IndexError is caught below and the post is skipped.
        place_spans = driver.find_elements_by_xpath("//*[@class='url-icon']/following-sibling::span[1]")
        matching = [span for span in place_spans if "郑" in span.text]
        item["place"] = matching[0].text
        time.sleep(1)

        item["forwarding"] = driver.find_element_by_class_name("tab-item").text
        time.sleep(1)
        item["comment"] = driver.find_element_by_xpath("//*[@class='lite-page-tab']/div[2]").text
        time.sleep(1)

        # Read the timestamp once and use it for both the stop check and the
        # record. An empty text raises IndexError and the post is skipped.
        timess = driver.find_element_by_class_name("time").text
        # NOTE(review): a leading "6" presumably marks posts from June, the
        # point at which scraping should stop - confirm the timestamp format.
        if timess[0] == "6":
            break
        item["time"] = timess
        time.sleep(1)

        print(item)
        content_list.append(item)
    except Exception as e:
        # Any missing element drops this post only; the loop continues.
        print(e)

# Write the collected posts to an Excel workbook, then end the browser session.
time.sleep(10)
try:
    # Spreadsheet column label -> key of the scraped item dict, in output
    # order: post text, image URLs, location, repost count, comment count, time.
    # The column values are built without binding anything to the name `time`,
    # so the imported `time` module is no longer shadowed.
    columns = [
        ('微博文字', "content"),
        ('图片url', "imgurl"),
        ('位置', "place"),
        ('转发量', "forwarding"),
        ('评论量', "comment"),
        ('评论时间', "time"),
    ]
    data = pd.DataFrame(
        {label: [entry[key] for entry in content_list] for label, key in columns}
    )
    data.to_excel("郑州洪水暴雨1.xlsx")
finally:
    # Always end the WebDriver session, even when the export raises.
    driver.quit()