Adventures of a Weibo Scraper (Python)

The pitfalls I hit while using Python to scrape a celebrity's Weibo posts and images.
Initial code

#-*-coding:utf8-*-

import re
import string
import sys
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
import requests
from lxml import etree

reload(sys) 
sys.setdefaultencoding('utf-8')
if(len(sys.argv) >=2):
    user_id = (int)(sys.argv[1])
else:
    user_id = (int)(raw_input(u"Enter user_id: "))

cookie = {"Cookie": "#your cookie"}
url = 'http://weibo.cn/u/%d?filter=1&page=1'%user_id

html = requests.get(url, cookies = cookie).content
selector = etree.HTML(html)
pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])

result = "" 
urllist_set = set()
word_count = 1
image_count = 1

print u'Scraper ready...'

for page in range(1,pageNum+1):

  # fetch the page HTML
  url = 'http://weibo.cn/u/%d?filter=1&page=%d'%(user_id,page) 
  lxml = requests.get(url, cookies = cookie).content

  # scrape post text
  selector = etree.HTML(lxml)
  content = selector.xpath('//span[@class="ctt"]')
  for each in content:
    text = each.xpath('string(.)')
    if word_count >= 4:
      text = "%d :"%(word_count-3) +text+"\n\n"
    else :
      text = text+"\n\n"
    result = result + text
    word_count += 1

  # scrape image links
  soup = BeautifulSoup(lxml, "lxml")
  urllist = soup.find_all('a',href=re.compile(r'^http://weibo.cn/mblog/oripic',re.I))
  first = 0
  for imgurl in urllist:
    urllist_set.add(requests.get(imgurl['href'], cookies = cookie).url)
    image_count +=1

fo = open("/Users/Personals/%s"%user_id, "wb")
fo.write(result)
word_path=os.getcwd()+'/%d'%user_id
print u'Text posts scraped'

link = ""
fo2 = open("/Users/Personals/%s_imageurls"%user_id, "wb")
for eachlink in urllist_set:
  link = link + eachlink +"\n"
fo2.write(link)
print u'Image links scraped'

if not urllist_set:
  print u'No images found on this page'
else:
  # download images into the weibo_image folder under the current directory
  image_path=os.getcwd()+'/weibo_image'
  if os.path.exists(image_path) is False:
    os.mkdir(image_path)
  x=1
  for imgurl in urllist_set:
    temp= image_path + '/%s.jpg' % x
    print u'Downloading image %s' % x
    try:
      urllib.urlretrieve(urllib2.urlopen(imgurl).geturl(),temp)
    except:
      print u"该图片下载失败:%s"%imgurl
    x+=1

print u'Original posts scraped: %d in total, saved to %s' % (word_count-4, word_path)
print u'Weibo images scraped: %d in total, saved to %s' % (image_count-1, image_path)

Environment: Python 3.7, pip 10.
Parameters: cookie, taken from your browser after logging in to Weibo yourself; user_id, the uid in the celebrity's Weibo profile URL.

  1. pip version
    Error: You are using pip version xxx, however version xxx is available.
    You should consider upgrading via the 'pip install --upgrade pip' command.
    Cause: pip is out of date (here it needed to go from 10 to 18).
    Fix: run 'pip install --upgrade pip'.

  2. etree
    Error:
    Cause: in this Python 3 setup, etree could not be imported from the lxml package (lxml.etree is a compiled module, which some tools fail to resolve).
    Fix: import lxml.html
    etree = lxml.html.etree
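
    A version-tolerant import keeps this working whether or not the compiled lxml.etree resolves; a minimal sketch, assuming lxml is installed:

    try:
        from lxml import etree            # the compiled module shipped with lxml
    except ImportError:
        import lxml.html                  # fallback used in this post
        etree = lxml.html.etree

    selector = etree.HTML('<p>hello</p>')
    print(selector.xpath('string(//p)'))  # -> hello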

  3. sys
    Error: module 'sys' has no attribute 'setdefaultencoding'
    Cause: Python 3 strings are Unicode by default, so sys.setdefaultencoding no longer exists.
    Fix: delete the sys.setdefaultencoding call (and the reload(sys) that preceded it).
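
    In Python 3 every str is already Unicode, so encoding choices move to the I/O boundary; a minimal sketch:

    text = "微博正文"                # already Unicode; no u'' prefix needed
    data = text.encode("utf-8")      # encode explicitly where bytes are required
    print(data.decode("utf-8"))      # and decode explicitly on the way back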

  4. raw_input
    Error: name 'raw_input' is not defined
    Cause: the Python 3.x series no longer has a raw_input function.
    Fix: replace raw_input with input (equivalent in Python 3).
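
    If the script still has to run under both interpreters, a small compat shim covers it; a sketch (on Python 3 alone, plain input() is enough):

    try:
        raw_input                     # defined on Python 2
    except NameError:
        raw_input = input             # Python 3: input() already returns str

    user_id = (int)(raw_input(u"Enter user_id: "))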

  5. fo = open("/xxxx/%s" % user_id, "wb")
    Error: UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 2: illegal multibyte sequence
    Cause: on Windows a new file defaults to the GBK encoding, while the script's text is UTF-8; writing it through that default fails.
    Fix: fo = open("/xxxx/%s" % user_id, "w", encoding="utf-8") (text mode; "wb" is binary mode and cannot take an encoding argument)
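
    Text mode plus an explicit encoding sidesteps the Windows GBK default. A minimal sketch with a hypothetical user_id:

    user_id = 1234567890              # hypothetical, for illustration only
    with open("%s_title.txt" % user_id, "w", encoding="utf-8") as fo:
        fo.write("任意 UTF-8 文本\n")   # characters like \xa0 now encode fine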

  6. File not found
    Error: No such file or directory: '/xxxx/user_id'
    Cause: the target folder does not exist.
    Fix: drop the leading / to make the path relative, or prefix an absolute Windows path with r; a sketch that creates the folder up front follows.
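
    Safer than hand-tuning slashes is to build the path with os.path and create the folder before writing; a sketch (the "text" folder matches the fixed script below):

    import os

    out_dir = os.path.join(os.getcwd(), "text")
    os.makedirs(out_dir, exist_ok=True)           # no error if it already exists
    out_file = os.path.join(out_dir, "%s_title.txt" % 1234567890)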

  7. soup.find_all('a', href=re.compile(r'^http://weibo.cn/mblog/oripic', re.I))
    Symptom: the script reports "No images found on this page".
    Cause: the wrong tag is searched; it should be the img tag.
    Fix: soup.find_all('img') picks the images up.
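
    A sketch of the working lookup; img.get('src') guards against <img> tags that lack a src attribute (the sample HTML here is made up):

    from bs4 import BeautifulSoup

    html = '<div><img src="http://example.com/a.jpg"><img alt="no src"></div>'
    soup = BeautifulSoup(html, "lxml")
    for img in soup.find_all('img'):
        src = img.get('src')          # None instead of KeyError when src is missing
        if src:
            print(src)                # -> http://example.com/a.jpg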

  8. Image download error
    Error: AttributeError("module 'urllib' has no attribute 'urlretrieve'")
    Cause: Python 3 differs from Python 2 here; urlretrieve now lives under urllib.request.
    Fix: urllib.request.urlretrieve(url, temp, Schedule)
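
    A sketch of the Python 3 call; the third argument is a progress hook with the same signature as the Schedule function in the fixed code below (URL and filename are placeholders):

    import urllib.request

    def schedule(blocks, block_size, total_size):
        if total_size > 0:            # the size can be -1 when the server omits it
            per = min(100.0, 100.0 * blocks * block_size / total_size)
            print('%.2f%%' % per)

    urllib.request.urlretrieve('http://example.com/a.jpg', 'a.jpg', schedule)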

Some function arguments are highlighted in red by the IDE, but the script still runs; I don't yet know why (it may be tool-related; I'm using IDEA).

Code after the fixes

# coding:utf-8
import urllib.request
import re
import importlib
import sys
import os
import urllib
from bs4 import BeautifulSoup
import requests
import lxml.html

etree = lxml.html.etree

def pachong(c):
    # importlib.reload(sys)
    # # sys.setdefaultencoding('utf-8')
    # if (len(sys.argv) >= 2):
    #     user_id = (int)(sys.argv[1])
    # else:
    user_id = (int)(input("Enter user_id: "))

    cookie = {"Cookie": c}
    url = 'http://weibo.cn/u/%d?filter=1&page=1' % user_id
    print("url: ", url)
    r = requests.get(url, cookies=cookie)
    html = r.content
    print("html: ", html)
    selector = etree.HTML(html)
    pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value'])

    result = ""
    urllist_set = set()
    word_count = 1
    image_count = 1

    print(u'Scraper ready...')

    for page in range(1, pageNum + 1):

        # fetch the page HTML
        url = 'http://weibo.cn/u/%d?filter=1&page=%d' % (user_id, page)
        lxml = requests.get(url, cookies=cookie).content

        # scrape post text
        selector = etree.HTML(lxml)
        content = selector.xpath('//span[@class="ctt"]')
        for each in content:
            text = each.xpath('string(.)')
            if word_count >= 4:
                text = "%d :" % (word_count - 3) + text + "\n\n"
            else:
                text = text + "\n\n"
            result = result + text
            word_count += 1

        # scrape image links
        soup = BeautifulSoup(lxml, "lxml")
        urllist = soup.find_all('img')
        for imgurl in urllist:
            src = imgurl.get('src')   # .get avoids a KeyError on <img> tags without src
            if not src:
                continue
            urllist_set.add(requests.get(src, cookies=cookie).url)
            image_count += 1

    fo = open("text/%s_title.txt" % user_id, "w", encoding="utf-8")
    fo.write(result)
    word_path = os.getcwd() + '/%d' % user_id
    print(u'Text posts scraped')

    link = ""
    fo2 = open("text/%s_imageurls.txt" % user_id, "w", encoding="utf-8")
    for eachlink in urllist_set:
        link = link + eachlink + "\n"
    fo2.write(link)
    print(u'Image links scraped')

    if not urllist_set:
        print(u'No images found on this page')
    else:
        # download images into the weibo_image folder under the current directory
        image_path = os.getcwd() + '/weibo_image'
        if os.path.exists(image_path) is False:
            os.mkdir(image_path)
        x = 1
        for imgurl in urllist_set:
            temp = image_path + '/%s.jpg' % x
            print(u'Downloading image %s' % x)
            try:
                url = urllib.request.urlopen(imgurl).geturl()
                urllib.request.urlretrieve(url, temp, Schedule)
            except Exception as e:
                print(u"该图片下载失败:%s" % imgurl)
                print("e: ",repr(e));
            x += 1

    print('Original posts scraped: %d in total, saved to %s' % (word_count - 4, word_path))
    print('Weibo images scraped: %d in total, saved to %s' % (image_count - 1, image_path))


def Schedule(a, b, c):
    # a: number of data blocks downloaded so far
    # b: size of each data block
    # c: size of the remote file
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)
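
A minimal way to drive the script; the cookie string below is a placeholder for the Cookie header copied from your own logged-in browser session:

if __name__ == '__main__':
    pachong("#your cookie")           # prompts for user_id, then scrapes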