某音粉丝抓取技术探讨

        本着学习研究之目的,做了一个提取工具,实验了下提取效率挺高,3s左右可以提取到20个fans的信息,一天提取十几W信息不是梦。。。

        目前很多爬虫,无法做到连续爬取一个用户的粉丝信息,爬取完前几页后总是会出现获取不到数据的情况,本工具针对这种情况特意做了优化,实现了连续不间断爬取,主要思路就是通过注册新设备号解决。

上图:

前端用的是 ng(Angular)+ electron,贴一下代码:

<!--
  Root template of the collector UI: a control bar (top), a status area
  (middle) and a fixed footer bar (bottom). It binds to the component fields
  `uid`, `pip`, `isRunning` and `count`, and to the handlers
  `startNewCollect`, `startLastCollect`, `stopCollect` and `exportExcel`.
-->
<div style="background-color:whitesmoke;width: 100%;height: 100%;">
  <!-- Control bar: target UID, proxy IP and the start/continue/stop buttons. -->
  <div style="height: 20%;line-height: 50px;text-align: center;background-color: #e0dbd2">
    <label>采集哪个用户的,输入这个用户的UID:</label>
    <!-- The UID input is disabled while a collection is running. -->
    <input nz-input  [(ngModel)]="uid" style="width:200px;" [disabled]="isRunning"/>
    &nbsp;&nbsp;&nbsp;<label>代理IP:</label>
    <!-- The proxy IP input has no [disabled] binding, so it stays editable during a run. -->
    <input nz-input  [(ngModel)]="pip" style="width:200px;"/>
    <br/>
    <!-- "New collection" always passes cursor 0; both start buttons are disabled while running. -->
    <button nz-button nzType="primary" (click)="startNewCollect(0)" style="margin-left: 15px;" [disabled]="isRunning">全新采集</button>
    <button nz-button nzType="primary" (click)="startLastCollect()" style="margin-left: 15px;" [disabled]="isRunning">继续上次采集</button>
    <!-- The stop button is never disabled. -->
    <button nz-button nzType="danger" (click)="stopCollect()" style="margin-left: 15px;">停止采集</button>
  </div>
  <!-- Status area: spinner while running, collected count, and the Excel export link. -->
  <div style="height: 70%;text-align: center;">
    <!-- The spinner element only exists while isRunning is true (*ngIf). -->
    <nz-spin style="line-height: 200px;" nzSimple *ngIf="isRunning" [nzSize]="'large'" nzTip="自动采集中..."></nz-spin>
    <label>已成功采集到{{count}}个</label>
    <!-- Export is disabled while a collection is running. -->
    <button nz-button nzType="link" (click)="exportExcel()" style="margin-left: 15px;" [disabled]="isRunning">导出到excel中</button>
  </div>
  <!-- Footer bar pinned to the bottom of the window; the red span currently renders no text. -->
  <div style="position:fixed;bottom: 0px;min-height: 10%;background-color: #e0dbd2;width: 100%;">
    <span style="line-height: 50px;margin-left: 15px;color: red;">
    </span>
  </div>
</div>
import { Component,NgZone } from '@angular/core';
import { WebsqlService } from './service/websql.service';
import { HttpService } from './service/http.service';
import { NzMessageService } from 'ng-zorro-antd/message';
declare var XLSX: any;


/**
 * Row of the "last progress" table as read by this component.
 * NOTE(review): the real column types depend on WebsqlService, which is not
 * visible here — confirm before relying on them.
 */
interface LastProgressRow {
  uid: string;
  count: number;
  more: unknown;
  min: number;
}

/** Row of the collected-user table as read by `exportExcel`. */
interface CollectedUserRow {
  uid: unknown;
  sid: unknown;
}

/** Minimal view of a Web SQL result set: only the members this component touches. */
interface SqlRows<T> {
  rows?: { length: number; item(index: number): T };
}

/** One follower entry of a follower-page response, limited to the fields used here. */
interface FollowerEntry {
  uid: unknown;
  short_id?: unknown;
  unique_id?: unknown;
}

/** Follower-page response, limited to the fields used here. */
interface FollowerResponse {
  data?: {
    msg?: string;
    followers?: FollowerEntry[];
    has_more?: unknown;
    min_time?: number;
  };
}

/**
 * Root component of the collector UI.
 *
 * Pages through the follower endpoint for one user, stores every follower and
 * the paging progress through WebsqlService, and exports the stored followers
 * to an Excel file.
 *
 * Paging is organised in "runs": every user-initiated start gets a new run id,
 * and callbacks that belong to an older run are ignored, so stopping and
 * restarting quickly can never leave two paging loops active at once.
 */
@Component({
  selector: 'app-root',
  templateUrl: './app.component.html',
  styleUrls: ['./app.component.css']
})
export class AppComponent {
  /** Delay between two consecutive requests (next page or retry), in milliseconds. */
  private static readonly STEP_DELAY_MS = 1000;
  /** Endpoint that returns one page of followers. */
  private static readonly FOLLOWER_URL = "/v2/douyin/user/follower";

  token = "xxx";
  money = "";
  uid = "";
  pip = "";
  count = 0;
  more = false;
  cursor = 0;
  isRunning = false;

  /** Id of the most recently started run; see the class comment. */
  private runId = 0;
  /** Handle of the pending next-page/retry timer, if any. */
  private pendingTimer: ReturnType<typeof setTimeout> | null = null;

  constructor(
    private _ngZone: NgZone,
    private websqlService: WebsqlService,
    private http: HttpService,
    private message: NzMessageService) {
    this.websqlService.createTB();
    this.getMoney();
    this.getLastData();
  }

  /**
   * Loads the stored progress (uid, count, more flag, cursor) into the
   * component. Does nothing when no progress row exists. The update happens
   * asynchronously, inside the Angular zone so the view refreshes.
   */
  getLastData(): void {
    this.websqlService.findLastAll((data: SqlRows<LastProgressRow>) => {
      if (!data || !data.rows || !(data.rows.length > 0)) {
        return;
      }
      const row = data.rows.item(0);
      this._ngZone.run(() => {
        this.uid = row.uid;
        this.count = row.count;
        this.more = AppComponent.asFlag(row.more);
        this.cursor = row.min;
      });
    });
  }

  /** Refreshes `money` from the account endpoint; a failure only logs a warning. */
  getMoney(): void {
    this.http.get("/v2/account/query", { token: this.token },
      (data: { money: string }) => {
        this._ngZone.run(() => {
          this.money = data.money;
        });
      },
      (msg: unknown) => {
        // Best effort: the balance is informational, so a failure must not
        // interrupt a running collection — but it should not vanish either.
        console.warn("account query failed", msg);
      });
  }

  /**
   * Starts a brand-new collection.
   * @param cursor paging cursor of the first request (the template passes 0)
   */
  startNewCollect(cursor: number): void {
    if (!this.hasRequiredInputs()) {
      return;
    }
    this.fetchPage(cursor, this.beginRun());
  }

  /**
   * Continues the previous collection from the last known cursor.
   * Does nothing when the stored progress says there are no more pages.
   */
  startLastCollect(): void {
    // Fire-and-forget refresh of the view from storage. The values used below
    // are the in-memory ones, which are kept current after every page.
    this.getLastData();
    if (!this.hasRequiredInputs()) {
      return;
    }
    this.getMoney();
    if (!this.more) {
      return;
    }
    this.fetchPage(this.cursor, this.beginRun());
  }

  /** Stops the current collection and cancels any scheduled request. */
  stopCollect(): void {
    this.isRunning = false;
    this.clearPendingTimer();
  }

  /** Exports all stored followers to an Excel file; does nothing when none are stored. */
  exportExcel(): void {
    this.websqlService.findUserAll((data: SqlRows<CollectedUserRow>) => {
      if (!data || !data.rows || !(data.rows.length > 0)) {
        return;
      }
      const list: Array<{ uid: unknown; did: unknown }> = [];
      for (let i = 0; i < data.rows.length; i++) {
        const row = data.rows.item(i);
        list.push({ "uid": row.uid, "did": row.sid });
      }
      this.xlsxExportOrigenalToExcel(list);
    });
  }

  /**
   * Writes `data` to a timestamp-named .xls file (sheet name "粉丝") and then
   * deletes the stored follower records.
   * @param data one plain object per spreadsheet row
   */
  xlsxExportOrigenalToExcel(data: unknown[]): void {
    const sheet = XLSX.utils.json_to_sheet(data);
    const book = XLSX.utils.book_new();
    XLSX.utils.book_append_sheet(book, sheet, '粉丝');
    const now = new Date();
    const stamp = now.getFullYear() + "年" + (now.getMonth() + 1) + "月" + now.getDate() + "日"
      + now.getHours() + "时" + now.getMinutes() + "分" + now.getSeconds() + "秒";
    XLSX.writeFile(book, stamp + '导出' + ".xls");
    // Reached only when writeFile did not throw, so records are never deleted
    // after a failed export.
    this.websqlService.delUserRecord();
  }

  /**
   * Interprets a stored or received "has more" value as a boolean.
   * NOTE(review): Web SQL may hand booleans back as text, in which case
   * "false" would be truthy — confirm how WebsqlService stores the flag.
   */
  private static asFlag(value: unknown): boolean {
    if (typeof value === "string") {
      const text = value.trim().toLowerCase();
      return !(text === "" || text === "false" || text === "0" || text === "null" || text === "undefined");
    }
    return Boolean(value);
  }

  /** Checks that UID and proxy IP are filled in; shows an error message otherwise. */
  private hasRequiredInputs(): boolean {
    if (!this.uid) {
      this.message.create("error", "请输入UID后再采集...");
      return false;
    }
    if (!this.pip) {
      this.message.create("error", "请输入代理IP后再采集...");
      return false;
    }
    return true;
  }

  /** Marks a new run as active and returns its id; cancels any leftover timer. */
  private beginRun(): number {
    this.clearPendingTimer();
    this.runId += 1;
    this.isRunning = true;
    return this.runId;
  }

  /** True while `run` is the latest run and has not been stopped. */
  private isActive(run: number): boolean {
    return this.isRunning && run === this.runId;
  }

  /** Cancels the scheduled next-page/retry request, if there is one. */
  private clearPendingTimer(): void {
    if (this.pendingTimer !== null) {
      clearTimeout(this.pendingTimer);
      this.pendingTimer = null;
    }
  }

  /** Runs `task` after the step delay, replacing any previously scheduled task. */
  private schedule(task: () => void): void {
    this.clearPendingTimer();
    this.pendingTimer = setTimeout(() => {
      this.pendingTimer = null;
      task();
    }, AppComponent.STEP_DELAY_MS);
  }

  /**
   * Requests one follower page for run `run`.
   * The proxy IP is read at request time, so it can be changed mid-run.
   */
  private fetchPage(cursor: number, run: number): void {
    if (!this.isActive(run)) {
      return;
    }
    if (!this.hasRequiredInputs()) {
      // An input was cleared mid-run: end the run instead of leaving the
      // spinner on forever.
      this.isRunning = false;
      return;
    }
    const param = {
      token: this.token,
      uid: this.uid,
      cursor: cursor,
      ip: this.pip
    };
    this.http.get(AppComponent.FOLLOWER_URL, param,
      (data: FollowerResponse) => {
        this._ngZone.run(() => this.handlePage(data, cursor, run));
      },
      () => {
        this._ngZone.run(() => this.retryLater(cursor, run));
      });
  }

  /** Re-requests the same page after the step delay, unless the run was stopped or superseded. */
  private retryLater(cursor: number, run: number): void {
    if (!this.isActive(run)) {
      return;
    }
    this.schedule(() => this.fetchPage(cursor, run));
  }

  /** Stores one received page and schedules the next one when appropriate. */
  private handlePage(data: FollowerResponse, cursor: number, run: number): void {
    if (run !== this.runId) {
      // A newer run has started; it will fetch this page itself, so storing
      // it here would duplicate records.
      return;
    }
    const page = data ? data.data : undefined;
    if (!page || page.msg === "No results." || !Array.isArray(page.followers)) {
      // Empty or malformed answer: same treatment as a transport error.
      this.retryLater(cursor, run);
      return;
    }

    for (const fans of page.followers) {
      const shortId = fans.short_id;
      const useShortId = typeof shortId === "string" && shortId.length > 2;
      this.websqlService.insertUserValue(fans.uid, useShortId ? shortId : fans.unique_id);
    }

    const total = Number(this.count) + page.followers.length;
    const hasMore = AppComponent.asFlag(page.has_more);
    this.websqlService.insertLastValue(this.uid, total, page.has_more, page.min_time);

    // Keep the in-memory progress current right away so that continuing does
    // not depend on the asynchronous storage read below having finished.
    // NOTE(review): presumably insertLastValue stores min_time in the column
    // that getLastData reads back as `min` — confirm against WebsqlService.
    this.count = total;
    this.more = hasMore;
    if (page.min_time !== undefined) {
      this.cursor = page.min_time;
    }
    this.getLastData();

    if (hasMore && this.isRunning) {
      this.schedule(() => {
        this.getMoney();
        this.fetchPage(this.cursor, run);
      });
    } else {
      this.isRunning = false;
    }
  }
}

比较完美,主要是采集过程中不会中断,并且可以在不中断采集的前提下更换代理IP。

  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值