本着学习研究的目的,做了一个提取工具。实测提取效率挺高:3 秒左右可以提取到 20 个粉丝(fans)的信息,一天提取十几万条信息不是梦……
目前很多爬虫,无法做到连续爬取一个用户的粉丝信息,爬取完前几页后总是会出现获取不到数据的情况,本工具针对这种情况特意做了优化,实现了连续不间断爬取,主要思路就是通过注册新设备号解决。
上图:
前端用的是 ng(Angular)+ Electron,贴一下代码:
<!-- Root container of the follower-collection screen. -->
<div style="background-color:whitesmoke;width: 100%;height: 100%;">

  <!-- Top bar: target UID, proxy IP and the start/stop controls. -->
  <div style="height: 20%;line-height: 50px;text-align: center;background-color: #e0dbd2">
    <label>采集哪个用户的,输入这个用户的UID:</label>
    <!-- The UID is locked while a collection is running. -->
    <input nz-input style="width:200px;" [(ngModel)]="uid" [disabled]="isRunning"/>
    <label>代理IP:</label>
    <!-- The proxy field stays editable during a run. -->
    <input nz-input style="width:200px;" [(ngModel)]="pip"/>
    <br/>
    <button nz-button nzType="primary" style="margin-left: 15px;" [disabled]="isRunning" (click)="startNewCollect(0)">全新采集</button>
    <button nz-button nzType="primary" style="margin-left: 15px;" [disabled]="isRunning" (click)="startLastCollect()">继续上次采集</button>
    <button nz-button nzType="danger" style="margin-left: 15px;" (click)="stopCollect()">停止采集</button>
  </div>

  <!-- Middle area: progress spinner, running total and the Excel export action. -->
  <div style="height: 70%;text-align: center;">
    <nz-spin *ngIf="isRunning" nzSimple nzSize="large" nzTip="自动采集中..." style="line-height: 200px;"></nz-spin>
    <label>已成功采集到{{count}}个</label>
    <button nz-button nzType="link" style="margin-left: 15px;" [disabled]="isRunning" (click)="exportExcel()">导出到excel中</button>
  </div>

  <!-- Footer strip pinned to the bottom; the red text span is currently empty. -->
  <div style="position:fixed;bottom: 0px;min-height: 10%;background-color: #e0dbd2;width: 100%;">
    <span style="line-height: 50px;margin-left: 15px;color: red;">
    </span>
  </div>

</div>
import { Component,NgZone } from '@angular/core';
import { WebsqlService } from './service/websql.service';
import { HttpService } from './service/http.service';
import { NzMessageService } from 'ng-zorro-antd/message';
declare var XLSX: any;
/**
 * One follower entry, limited to the fields this component reads.
 * NOTE(review): the real API schema is not visible here — confirm field types.
 */
interface FollowerEntry {
  uid: string | number;
  short_id?: string | null;
  unique_id?: string | null;
}

/**
 * Payload of the follower-list response, limited to the fields this component reads.
 * NOTE(review): optionality and types are assumptions — confirm against the API.
 */
interface FollowerPage {
  msg?: string;
  followers?: FollowerEntry[];
  has_more?: boolean;
  min_time?: number;
}

/** Envelope of the follower-list response as consumed below (`response.data.*`). */
interface FollowerResponse {
  data?: FollowerPage | null;
}

/**
 * Root component of the follower collector.
 *
 * It pages through `/v2/douyin/user/follower` for the entered UID, stores every
 * follower through `WebsqlService`, keeps a "last position" record so a run can
 * be resumed, and exports the stored followers to an Excel file.
 */
@Component({
  selector: 'app-root',
  templateUrl: './app.component.html',
  styleUrls: ['./app.component.css']
})
export class AppComponent {
  /** Delay between two page requests, and before retrying a failed request. */
  private static readonly PAGE_DELAY_MS = 1000;

  /** API token sent with every request. */
  token = "xxx";
  /** Value of `money` from the last successful account query. */
  money = "";
  /** UID of the user whose followers are collected (bound to the template). */
  uid = "";
  /** Proxy IP sent with every follower request (bound to the template). */
  pip = "";
  /** Number of followers collected so far. */
  count = 0;
  /** Whether the last stored page reported more followers. */
  more = false;
  /** Cursor to resume from (the `min_time` of the last stored page). */
  cursor = 0;
  /** True while a collection run is active; drives the spinner and button states. */
  isRunning = false;

  /** Incremented on every user-initiated start so callbacks of older runs can be ignored. */
  private runId = 0;
  /** Handle of the pending "next page"/"retry" timer, if any. */
  private pendingTimer: ReturnType<typeof setTimeout> | null = null;

  constructor(
    private _ngZone: NgZone,
    private websqlService: WebsqlService,
    private http: HttpService,
    private message: NzMessageService) {
    this.websqlService.createTB();
    this.getMoney();
    this.getLastData();
  }

  /**
   * Loads the stored "last position" record (uid, count, more, cursor) into the
   * component. Does nothing when no record is returned.
   */
  getLastData() {
    this.websqlService.findLastAll((data) => {
      if (!data || !data.rows || data.rows.length === 0) {
        return;
      }
      const last = data.rows.item(0);
      // Assign inside the Angular zone so the template is refreshed.
      this._ngZone.run(() => {
        this.uid = last.uid;
        this.count = last.count;
        this.more = last.more;
        this.cursor = last.min;
      });
    });
  }

  /** Queries the account and stores the returned `money`; failures are ignored (best effort). */
  getMoney() {
    this.http.get("/v2/account/query", { token: this.token },
      (data) => {
        if (data) {
          this.money = data.money;
        }
      },
      () => {
        // Deliberately ignored: the balance is informational only.
      });
  }

  /**
   * Starts a brand-new collection.
   * @param cursor cursor of the first page to request (the template passes 0)
   */
  startNewCollect(cursor: number) {
    if (!this.hasRequiredInputs()) {
      return;
    }
    this.beginRun(cursor);
  }

  /**
   * Resumes the previous collection from the stored cursor.
   * Does nothing when the stored record says there are no more followers.
   */
  startLastCollect() {
    // Asynchronous refresh of the stored record; the checks below use the values
    // already in memory (loaded at construction and updated after every page).
    this.getLastData();
    if (!this.hasRequiredInputs()) {
      return;
    }
    this.getMoney();
    if (!this.more) {
      return;
    }
    this.beginRun(this.cursor);
  }

  /** Stops the active run: no further page or retry is issued after this call. */
  stopCollect() {
    this.isRunning = false;
    this.clearPendingTimer();
  }

  /**
   * Reads all stored followers and, when there is at least one, exports them.
   */
  exportExcel() {
    this.websqlService.findUserAll((data) => {
      if (!data || !data.rows || data.rows.length === 0) {
        return;
      }
      const list: { uid: unknown; did: unknown }[] = [];
      for (let i = 0; i < data.rows.length; i++) {
        const row = data.rows.item(i);
        list.push({ "uid": row.uid, "did": row.sid });
      }
      this.xlsxExportOrigenalToExcel(list);
    });
  }

  /**
   * Writes `data` to a timestamped `.xls` workbook with a single sheet and then
   * deletes the stored follower records.
   * @param data rows to export, one object per row
   */
  xlsxExportOrigenalToExcel(data: unknown[]) {
    const sheet = XLSX.utils.json_to_sheet(data);
    const book = XLSX.utils.book_new();
    XLSX.utils.book_append_sheet(book, sheet, '粉丝');
    const now = new Date();
    const stamp = now.getFullYear() + "年" + (now.getMonth() + 1) + "月" + now.getDate() + "日"
      + now.getHours() + "时" + now.getMinutes() + "分" + now.getSeconds() + "秒";
    XLSX.writeFile(book, stamp + '导出' + ".xls");
    this.websqlService.delUserRecord();
  }

  /** Shows an error and returns false when the UID or the proxy IP is missing. */
  private hasRequiredInputs(): boolean {
    if (this.uid === "") {
      this.message.create("error", "请输入UID后再采集...");
      return false;
    }
    if (this.pip === "") {
      this.message.create("error", "请输入代理IP后再采集...");
      return false;
    }
    return true;
  }

  /** Marks a new run as active and requests its first page. */
  private beginRun(cursor: number) {
    this.clearPendingTimer();
    this.runId += 1;
    this.isRunning = true;
    this.requestPage(cursor, this.runId);
  }

  /** Requests one follower page; `uid` and `pip` are read at request time. */
  private requestPage(cursor: number, runId: number) {
    const uid = this.uid;
    const param = {
      token: this.token,
      uid: uid,
      cursor: cursor,
      ip: this.pip
    };
    this.http.get("/v2/douyin/user/follower", param,
      (response) => this.handlePage(response, uid, cursor, runId),
      () => this.retryLater(cursor, runId));
  }

  /** Stores one page and decides whether to retry, continue or finish. */
  private handlePage(response: FollowerResponse | null | undefined, uid: string, cursor: number, runId: number) {
    if (runId !== this.runId) {
      // Response of a run that has since been replaced by a newer one.
      return;
    }
    const page = response ? response.data : null;

    if (page && page.msg === "No results.") {
      // Same request is repeated right away, unless the user pressed stop.
      if (this.isRunning) {
        this.requestPage(cursor, runId);
      }
      return;
    }
    if (!page || !Array.isArray(page.followers)) {
      // Malformed payload: treat like a failed request instead of throwing.
      this.retryLater(cursor, runId);
      return;
    }

    const followers = page.followers;
    followers.forEach((fan) => {
      const shortId = fan.short_id;
      // Prefer short_id when it is longer than two characters, otherwise unique_id.
      const displayId = typeof shortId === "string" && shortId.length > 2 ? shortId : fan.unique_id;
      this.websqlService.insertUserValue(fan.uid, displayId);
    });

    const total = Number(this.count) + followers.length;
    this.websqlService.insertLastValue(uid, total, page.has_more, page.min_time);

    // Use the response itself for the next cursor so paging does not depend on
    // how quickly the stored record can be read back.
    const nextCursor = page.min_time !== undefined ? page.min_time : cursor;
    const shouldContinue = !!page.has_more && this.isRunning;
    this._ngZone.run(() => {
      this.count = total;
      this.more = !!page.has_more;
      this.cursor = nextCursor;
      if (!shouldContinue) {
        this.isRunning = false;
      }
    });
    if (shouldContinue) {
      this.schedule(() => this.continueRun(nextCursor, runId));
    }
  }

  /** Schedules a retry of the same page after the page delay, unless stopped. */
  private retryLater(cursor: number, runId: number) {
    if (runId !== this.runId || !this.isRunning) {
      return;
    }
    this.schedule(() => this.continueRun(cursor, runId));
  }

  /** Timer target: re-validates inputs, refreshes the balance and requests the page. */
  private continueRun(cursor: number, runId: number) {
    this.pendingTimer = null;
    if (runId !== this.runId || !this.isRunning) {
      return;
    }
    if (!this.hasRequiredInputs()) {
      // Inputs were cleared mid-run: end the run instead of leaving the spinner on.
      this._ngZone.run(() => {
        this.isRunning = false;
      });
      return;
    }
    this.getMoney();
    this.requestPage(cursor, runId);
  }

  /** Replaces any pending timer with one that runs `task` after the page delay. */
  private schedule(task: () => void) {
    this.clearPendingTimer();
    this.pendingTimer = setTimeout(task, AppComponent.PAGE_DELAY_MS);
  }

  /** Cancels the pending timer, if any. */
  private clearPendingTimer() {
    if (this.pendingTimer !== null) {
      clearTimeout(this.pendingTimer);
      this.pendingTimer = null;
    }
  }
}
比较完美,主要是采集过程中间不会中断,而且可以在不中断的前提下更换代理IP。