微博关键词搜索并爬取前40页内容与图片_按关键字爬取微博文章和图片-CSDN博客

本文链接：https://blog.csdn.net/weixin_41357912/article/details/107118509

微博关键词搜索并爬取前40页内容与图片

# -*- coding: utf-8 -*-
"""
@author: tanderick
"""
import requests
import re 
import os
import urllib.parse
import time

# HTTP request headers sent with every request (a desktop Firefox User-Agent).
# NOTE(review): the User-Agent value ends with an unbalanced ')' - confirm
# whether that is intentional before normalising it.
headers ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0)'}
# Search keyword
keyword = '简历'
# Output folder named after the keyword. makedirs() also creates the parent
# folder when it does not exist yet, and exist_ok=True makes a re-run a no-op
# instead of relying on a separate (racy) existence check.
filepath = 'C:/weibo/' + keyword
os.makedirs(filepath, exist_ok=True)
# Fetch 40 pages of search results for the keyword and save their source
# into a single text file.
kw = urllib.parse.quote(keyword)  # percent-encode the keyword for the query string
s_url = 'https://s.weibo.com/weibo?q=' + kw + '&wvr=6&b=1&Refer=SWeibo_box'
# Open the dump file once, in write mode, so that running the script again
# replaces the previous dump instead of appending duplicate pages to it.
with open(filepath + '/' + keyword + '.txt', 'w', encoding="utf-8") as f:
    # Pages are requested as 1..40 so that exactly 40 distinct pages are fetched.
    # NOTE(review): assumes the site's "page" parameter is 1-based - confirm.
    for i in range(1, 41):
        # timeout keeps a stalled connection from hanging the script forever
        response = requests.get(s_url + '&page=' + str(i), headers=headers, timeout=10)
        # Decode percent-escapes so the later regexes see plain URLs
        html = urllib.parse.unquote(response.text)
        print(i)
        f.write(html)
        # Short pause between requests
        time.sleep(0.5)
# Load the saved page sources back from disk as one string.
dump_path = f'{filepath}/{keyword}.txt'
with open(dump_path, mode='r', encoding="utf-8") as h:
    html = h.read()
# Parse the saved pages card by card, then store each post's text and pictures
# in a folder named after the post's author.
def _download_jpgs(url_stems, out_dir):
    """Download JPEG pictures into *out_dir*.

    Each item of *url_stems* is a picture URL with the trailing ``jpg`` cut
    off (that is how the regexes below capture it). A picture that cannot be
    fetched is reported and skipped; it does not abort the run.
    """
    for stem in url_stems:
        # Drop an explicit scheme so the URL can be rebuilt uniformly below
        stem = re.sub(r'^https?:', '', str(stem))
        filename = stem.split('/')[-1]
        url = 'http:' + stem + 'jpg'
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            print(url + '下载失败: ' + str(exc))
            continue
        if response.status_code != 200:
            # Do not save an error page under a .jpg name
            print(url + '下载失败: HTTP ' + str(response.status_code))
            continue
        with open(out_dir + '/' + filename + 'jpg', 'wb') as f:
            f.write(response.content)
        print(url + '下载完成')


# Author link; captures (uid, nickname). The '?' in front of refer_flag is
# escaped so it matches the literal query separator instead of acting as a
# quantifier (which left the '?' inside the captured uid).
uid_pattern = re.compile(r'<a href="//weibo.com/(.*?)\?refer_flag=1001030103_" class=".*?" target=".*?" nick-name="(.*?)" suda-data=".*?">.*?</a>')
# Post text paragraph; may span several lines, hence re.S
content_pattern = re.compile(r' <p class="txt" node-type="feed_list_content" nick-name=".*?">(.*?)</p>', re.S)
# One entry per result card
cards = re.findall('<!--card-wrap-->(.*?)<!--/card-wrap-->', html, re.S)
# Author, text and pictures are all looked up inside the same card, so they
# can never get out of step with each other (three separately built lists of
# different lengths would mis-pair them or raise IndexError).
# NOTE(review): assumes every post card contains its own author link and text
# paragraph - confirm against a real page dump.
for card in cards:
    user = uid_pattern.search(card)
    if user is None:
        # Card without an author link: nothing to attribute it to
        continue
    uid, nickname = user.groups()
    text_match = content_pattern.search(card)
    text = text_match.group(1) if text_match else ''
    out_filepath = filepath + '/' + nickname
    os.makedirs(out_filepath, exist_ok=True)
    # Save the author tuple and the post text with HTML tags stripped.
    # flags= must be passed by keyword: the 4th positional argument of
    # re.sub is "count", not "flags".
    with open(out_filepath + '/微博内容.txt', 'a', encoding="utf-8") as f:
        f.write(str((uid, nickname)) + '\r\n' + re.sub('<.*?>', '', text, flags=re.S))
    # Download the pictures referenced by this card: inline <img> tags and
    # cover_img= parameters
    _download_jpgs(re.findall('img src="(.*?)jpg"', card), out_filepath)
    _download_jpgs(re.findall('cover_img=(.*?)jpg', card), out_filepath)