Scraping Weibo

import requests
import json
import codecs
import time
import random
import csv
from fake_useragent import UserAgent
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType

# single-month search template (July 2017); kept for reference, not used below
url_page = 'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2017-07-01:2017-08-01&page={}'

url_2016 = [
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-01-01:2016-02-01&page={}',  # 17139025785
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-02-02:2016-03-01&page={}',  # 14556642563
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-03-02:2016-04-01&page={}',  # 17326257480
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-04-02:2016-05-01&page={}',  # 1003663035@qq.com
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-05-02:2016-06-01&page={}',  # 17139025785
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-06-02:2016-07-01&page={}',  # 14556642563
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-07-02:2016-08-01&page={}',  # 17326257480
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-08-02:2016-09-01&page={}',  # 1003663035@qq.com
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-09-02:2016-10-01&page={}',  # 17139025785
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-10-02:2016-11-01&page={}',  # 14556642563
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-11-02:2016-12-01&page={}',  # 17326257480
    'http://s.weibo.com/weibo/%25E4%25B8%2580%25E5%25B8%25A6%25E4%25B8%2580%25E8%25B7%25AF&xsort=hot&suball=1&timescope=custom:2016-12-02:2016-12-31&page={}'   # 1003663035@qq.com
]
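
# Note (added): the trailing "&page={}" placeholder is never filled in by this script --
# pagination is handled by clicking the "next page" link in get_html() instead. A single
# page could still be built directly, e.g. url_2016[0].format(1) for page 1.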

file1 = open('ip.txt', 'r')
ips = file1.readlines()

def get_ip():
    # pick a random proxy from the pool and strip the newline left by readlines()
    ip = random.choice(ips).strip()
    print(ip)
    proxy = Proxy(
        {
            'proxyType': ProxyType.MANUAL,
            'httpProxy': ip
        }
    )
    return proxy
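
# Added note (assumption): ip.txt is expected to hold one plain "host:port" proxy per line,
# e.g. 123.57.76.128:8080, from which get_ip() picks one per month-long crawl.
# The Proxy object above only registers an HTTP proxy; since weibo.com also serves pages
# over HTTPS, the same address may need to be set as the SSL proxy too, e.g.:
#     Proxy({'proxyType': ProxyType.MANUAL, 'httpProxy': ip, 'sslProxy': ip})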

def get_html(url, s_proxy, user_name, pass_word):
    driver = webdriver.Firefox(proxy=s_proxy)
    driver.maximize_window()
    driver.get(url)
    time.sleep(random.randint(3, 9))
    driver.find_element_by_xpath('//a[@node-type="loginBtn"]').click()
    time.sleep(random.randint(5, 13))
    driver.find_element_by_xpath('//input[@node-type="username"]').clear()
    driver.find_element_by_xpath('//input[@node-type="username"]').send_keys(user_name)  # locate the username box and type the account
    time.sleep(random.randint(4, 10))
    driver.find_element_by_xpath('//input[@node-type="password"]').clear()
    driver.find_element_by_xpath('//input[@node-type="password"]').send_keys(pass_word)  # locate the password box and type the password
    time.sleep(random.randint(2, 7))
    driver.find_element_by_xpath('//a[@node-type="submitBtn"]').click()  # click the login button
    time.sleep(random.randint(11, 28))
    data = driver.page_source
    if data:
        parse_detail(data)
    page = 1
    while True:
        page += 1
        print('page is --------', page)
        try:
            driver.find_element_by_xpath('//a[@class="page next S_txt1 S_line1"]').click()  # click the "next page" link
            # scroll down in stages so lazily loaded posts get rendered
            driver.execute_script("window.scrollTo(0,400)")
            time.sleep(random.uniform(1.2, 3.5))
            driver.execute_script("window.scrollTo(0,2000)")
            time.sleep(random.uniform(1.2, 3.5))
            driver.execute_script("window.scrollTo(0,3000)")
            time.sleep(random.uniform(1.2, 3.5))
            driver.execute_script("window.scrollTo(0,5000)")

            time.sleep(random.uniform(4.1, 10.6))
            next_data = driver.page_source
            parse_detail(next_data)
        except Exception:
            break

def parse_detail(parse_data):
    soup = bs(parse_data, 'lxml')

    divs = soup.find_all('div', class_="WB_cardwrap S_bg2 clearfix")
    aa = 0
    for content in divs:
        aa += 1
        print('post no. ---%d---' % aa)
        # defaults so the CSV row can still be written if a card lacks text or counts
        text = transmit_amount_content = comment_amount_content = praise_amount_content = ''
        p_content = content.find('p', class_="comment_txt")
        amounts = content.find('ul', class_="feed_action_info feed_action_row4")
        if amounts:
            amount = amounts.find_all('li')
            transmit_amount_li = amount[1]          # <li> holding the repost count
            transmit_amount_content = transmit_amount_li.find('span', class_="line S_line1").get_text()   # "转发" text with count
            print(transmit_amount_content)

            comment_amount_li = amount[2]           # <li> holding the comment count
            comment_amount_content = comment_amount_li.find('span', class_="line S_line1").get_text()     # "评论" text with count
            print(comment_amount_content)

            praise_amount_li = amount[3]            # <li> holding the like count
            praise_amount_content = praise_amount_li.find('span', class_="line S_line1").get_text()       # like count
            praise_amount_content = '赞' + praise_amount_content
            print(praise_amount_content)

        if p_content:
            headers = {'User-Agent': UserAgent().random}
            if p_content.find_all('a', class_="WB_text_opt"):
                # post is longer than 140 characters: fetch the full text via the "expand" ajax endpoint
                action_data = p_content.find('a', class_="WB_text_opt")['action-data']
                url = 'http://s.weibo.com/ajax/direct/morethan140?' + action_data + '&_t=0&__rnd=1532056726026'
                response = requests.get(url, headers=headers)
                response.encoding = 'utf-8'
                text1 = json.loads(response.text)
                text = text1['data']['html']
            else:
                text = p_content.get_text().strip()
            print(text)

        writer = csv.writer(file)
        writer.writerow([text, transmit_amount_content, comment_amount_content, praise_amount_content])

if __name__ == '__main__':
    for date_url in url_2016:
        # rotate accounts: one account per month, four accounts in total
        if '2016-01-01' in date_url or '2016-05-02' in date_url or '2016-09-02' in date_url:
            username = '17139025785'
            password = 'hn12021'
        elif '2016-02-02' in date_url or '2016-06-02' in date_url or '2016-10-02' in date_url:
            username = '14556642563'
            password = 'hn12021'
        elif '2016-03-02' in date_url or '2016-07-02' in date_url or '2016-11-02' in date_url:
            username = '17326257480'
            password = 'hn12021'
        elif '2016-04-02' in date_url or '2016-08-02' in date_url or '2016-12-02' in date_url:
            username = '1003663035@qq.com'
            password = 'lijiaojiao2010'

        name = '_'.join(date_url.split(':')[2].split('-')[:2])  # e.g. '2016_01', used as the output file name
        file = open(name, 'w', encoding='utf-8-sig', newline='')

        proxy = get_ip()
        get_html(date_url, proxy, username, password)
        file.close()
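
pandas is imported at the top but never used in the script itself, so the monthly CSV files are presumably analysed afterwards. A minimal read-back sketch under that assumption (the file name 2016_01 and the column names are illustrative; the files are written without a header row, in the order text, reposts, comments, likes):

import pandas as pd

# load one month of scraped posts back into a DataFrame (hypothetical file name)
df = pd.read_csv('2016_01', header=None,
                 names=['text', 'reposts', 'comments', 'likes'],
                 encoding='utf-8-sig')
print(df.head())
print('posts scraped:', len(df))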