python 爬取微博一页数据

from selenium import webdriver
import time
import pandas as pd

# Start Chrome through a local chromedriver binary and log in to Weibo.
# The path is a raw string so the backslashes of the Windows path are never
# interpreted as (invalid) escape sequences.
driver = webdriver.Chrome(executable_path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver')
driver.get("http://weibo.com/login.php")
time.sleep(3)

# XPath locators of the login form controls.
username_xpath = '//*[@id="loginname"]'
password_xpath = '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input'
submit_xpath = '//*[@id="pl_login_form"]/div/div[3]/div[6]/a'

# Fill in the account name and password. The two literals below are
# placeholders that must be replaced with real credentials before running.
driver.find_element_by_xpath(username_xpath).clear()
driver.find_element_by_xpath(username_xpath).send_keys('微博账号')
driver.find_element_by_xpath(password_xpath).clear()
time.sleep(1)
driver.find_element_by_xpath(password_xpath).send_keys('微博密码')
time.sleep(1)
driver.find_element_by_xpath(submit_xpath).click()
# Wait 20 seconds so the login QR code can be scanned with a phone.
time.sleep(20)

# Collected posts: one dict per post is appended to this list later on.
content_list = []

# Open the mobile search result page and give it time to render.
search_page = "https://m.weibo.cn/search?disable_history=1&disable_hot=0&extparam=8008641010000000000_0_0&luicode=10000011&lfid=23103600258008641010000000000&containerid=100103type%3D557%26t%3D10%26q%3D%E6%9A%B4%E9%9B%A8%E6%B4%AA%E6%B0%B4"
driver.get(search_page)
time.sleep(7)

# Scroll towards the bottom in 100-pixel steps and keep going for as long as
# document.body.scrollHeight is larger than the height already scrolled
# (presumably the page appends more posts as it is scrolled — confirm).
height_query = "return action=document.body.scrollHeight"
scroll_command = 'window.scrollTo(0, {})'
# Height that has already been scrolled through; nothing at the start.
scrolled_to = 0
# Total height of the document as currently reported by the browser.
page_height = driver.execute_script(height_query)
while scrolled_to < page_height:
    # Walk from the previous bottom to the current bottom of the page.
    for offset in range(scrolled_to, page_height, 100):
        driver.execute_script(scroll_command.format(offset))
        time.sleep(0.2)
    scrolled_to = page_height
    time.sleep(0.1)
    # Measure again; the loop ends once the height no longer increases.
    page_height = driver.execute_script(height_query)

# Give the fully scrolled result list time to settle, then count its posts.
time.sleep(10)
li_lis = driver.find_elements_by_xpath("//*[@class='weibo-text']")
time.sleep(1)
post_count = len(li_lis)
print(post_count)

search_url = "https://m.weibo.cn/search?disable_history=1&disable_hot=0&extparam=8008641010000000000_0_0&luicode=10000011&lfid=23103600258008641010000000000&containerid=100103type%3D557%26t%3D10%26q%3D%E6%9A%B4%E9%9B%A8%E6%B4%AA%E6%B0%B4"
js = "return action=document.body.scrollHeight"

# Visit the posts one by one, addressed by their position in the result list.
for iii in range(post_count):
    print(iii)
    # The result list is reloaded and scrolled again on every pass so that the
    # post at index iii can be looked up afresh.
    driver.get(search_url)
    time.sleep(5)

    # Height already scrolled through (0 at the start) and the current total
    # document height; scroll until the height stops increasing.
    height = 0
    new_height = driver.execute_script(js)
    while height < new_height:
        for offset in range(height, new_height, 100):
            driver.execute_script('window.scrollTo(0, {})'.format(offset))
            time.sleep(0.2)
        height = new_height
        time.sleep(0.1)
        new_height = driver.execute_script(js)

    time.sleep(10)
    li_list = driver.find_elements_by_xpath("//*[@class='weibo-text']")
    # Same total wait as before the click in the original (6 s + 10 s).
    time.sleep(16)
    try:
        # Presumably the click opens the post's own page — confirm.
        li_list[iii].click()
    except Exception as e:
        # The reloaded list may hold fewer posts than the first count, or the
        # click itself may fail. In both cases the browser is still on the
        # search page, so reading fields now would record the wrong post:
        # report the problem and move on to the next index.
        print(e)
        continue
    time.sleep(3)

    item = {}
    try:
        item["content"] = driver.find_element_by_class_name("weibo-text").text
        time.sleep(1)

        # All image URLs of the post, each followed by ";".
        image_items = driver.find_elements_by_xpath("//*[@class='m-auto-list']/li")
        item["imgurl"] = "".join(
            li.find_element_by_tag_name("img").get_attribute("src") + ";"
            for li in image_items
        )
        time.sleep(1)

        # First candidate span whose text contains "郑"; when there is none the
        # IndexError is reported below and the post is not recorded.
        place_spans = driver.find_elements_by_xpath("//*[@class='url-icon']/following-sibling::span[1]")
        place_texts = [span.text for span in place_spans]
        item["place"] = [text for text in place_texts if "郑" in text][0]
        time.sleep(1)

        item["forwarding"] = driver.find_element_by_class_name("tab-item").text
        time.sleep(1)
        item["comment"] = driver.find_element_by_xpath("//*[@class='lite-page-tab']/div[2]").text
        time.sleep(1)

        timess = driver.find_element_by_class_name("time").text
        # NOTE(review): a time text starting with "6" ends the whole crawl and
        # the current post is discarded; presumably this marks posts older than
        # the period of interest — confirm.
        if timess[0] == "6":
            break
        item["time"] = timess
        time.sleep(1)
        print(item)
        content_list.append(item)
    except Exception as e:
        # Any missing element drops this post; keep crawling with the next one.
        print(e)


# Write the collected posts to an Excel file, then close the browser.
try:
    time.sleep(10)
    # Spreadsheet column header -> key of the scraped item dict, in column
    # order: post text, image URLs, place, forward count, comment count, time.
    # The lists are built under their own names so the imported `time` module
    # is not overwritten by the list of post times.
    column_keys = (
        ('微博文字', "content"),
        ('图片url', "imgurl"),
        ('位置', "place"),
        ('转发量', "forwarding"),
        ('评论量', "comment"),
        ('评论时间', "time"),
    )
    data = pd.DataFrame({
        header: [post[key] for post in content_list]
        for header, key in column_keys
    })
    data.to_excel("郑州洪水暴雨1.xlsx")
finally:
    # Always shut the browser down, even when the export fails.
    driver.quit()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值