提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
前言
提示:这里可以添加本文要记录的大概内容:
对webscraper中SelectorElementScroll部分改进,使其先滚动到底部加载全部网页,再读出所有数据。
因Firefox浏览器加载自己写的扩展程序需要审批,因此使用360浏览器加载改写的扩展,QQ浏览器中部分功能不能正常运行,其他浏览器未尝试。
提示:本代码在360浏览器中执行通过
一、对webscraper0.2.0.18的修改
该版本中SelectorElementScroll正好处于script/Selector文件夹下,文件结构较简单,但官方初始版本在执行上就发现有些浏览器不兼容,因此后面对更高版本也做了修改。
/**
 * "Element scroll down" selector for Web Scraper 0.2.0.18.
 *
 * Modified behaviour: scroll to the bottom of the page until the document
 * height stops changing, then walk back to the top in small steps, and only
 * then read the matching elements.
 *
 * Relies on names supplied by the surrounding extension: `$`/`jQuery`, plus
 * `this.delay` and `this.getDataElements`, which are not defined in this object.
 */
var SelectorElementScroll = {
    canReturnMultipleRecords: function () {
        return true;
    },
    canHaveChildSelectors: function () {
        return true;
    },
    canHaveLocalChildSelectors: function () {
        return true;
    },
    canCreateNewJobs: function () {
        return false;
    },
    willReturnElements: function () {
        return true;
    },
    /** Jump straight to the current bottom of the document. */
    scrollToBottom: function () {
        window.scrollTo(0, document.documentElement.scrollHeight);
    },
    /**
     * Move the viewport up by one 50px step, or to the very top when less
     * than one step remains.
     */
    scrollToTop: function () {
        var step = 50;
        var currentTop = document.documentElement.scrollTop;
        // window.scrollTo takes (x, y). The previous code passed scrollTop as x,
        // which scrolled the page sideways; x stays at 0 like scrollToBottom.
        window.scrollTo(0, currentTop > step ? currentTop - step : 0);
    },
    /**
     * Scroll down until the page stops growing, scroll back up, then resolve
     * with the matched elements.
     *
     * @param parentElement element passed through to this.getDataElements
     * @returns a jQuery promise resolved with an array of the found elements
     */
    _getData: function (parentElement) {
        var delay = parseInt(this.delay, 10) || 0;
        // Total time (ms, measured from the first wake-up) after which an
        // unchanged document height is treated as "bottom reached".
        var maxLoadTime = 10000;
        var deferredResponse = $.Deferred();
        var lastHeight = 0;
        var isTouchBottom = false;

        // initially scroll down and wait
        this.scrollToBottom();
        var nextElementSelection = (new Date()).getTime() + delay;
        var begin = nextElementSelection;

        var interval = setInterval(function () {
            var now = (new Date()).getTime();
            // Still inside the delay window: keep the timer alive and try again
            // on the next tick. Clearing the interval here (as before) stopped
            // polling for good, so the promise never resolved.
            if (now < nextElementSelection) {
                return;
            }

            var root = document.documentElement;

            if (!isTouchBottom) {
                if (lastHeight !== root.scrollHeight) {
                    // page grew: remember the new height and keep scrolling down
                    lastHeight = root.scrollHeight;
                    this.scrollToBottom();
                    nextElementSelection = now + delay;
                } else if (now - begin > maxLoadTime) {
                    // height is stable and the time budget is used up:
                    // start the slow walk back up so lazy images get loaded
                    isTouchBottom = true;
                    this.scrollToTop();
                    nextElementSelection = now + delay;
                }
                // otherwise: height unchanged but budget not used up, keep waiting
                return;
            }

            if (root.scrollTop !== 0) {
                // smooth scroll up, one step per wake-up
                this.scrollToTop();
                nextElementSelection = now + delay;
                return;
            }

            // back at the top: everything is loaded, collect the elements once
            clearInterval(interval);
            deferredResponse.resolve(jQuery.makeArray(this.getDataElements(parentElement)));
        }.bind(this), 50);

        return deferredResponse.promise();
    },
    getDataColumns: function () {
        return [];
    },
    getFeatures: function () {
        return ['multiple', 'delay'];
    }
};
二、对0.3.6版本的修改
1.导入
导入src目录,先修改其中的manifest.json文件,完善插件名称、版本号等。
代码如下(示例):
2.修正报错
要点2:
引用模块必须在json(如manifest.json)中声明类型 "type": "module",或在html的script标签中添加 type="module",否则浏览器无法识别import语句,并报如下错误:
Uncaught SyntaxError: Cannot use import statement outside a module
如:
<script type="module" src="/public/portis.js"></script>
或
"background": {
"page": "./background/background.html",
"type": "module"
},
要点3:
导入的模块文件地址前不能省略 /(根文件夹)、./(本文件夹)或 ../(上层文件夹),须按照规范书写,否则报如下错误:
Uncaught TypeError: Failed to resolve module specifier "mymodule.js". Relative references must start with either "/", "./", or "../".
如:
import * as browser from "../webextension-polyfill/browser-polyfill.min.js";
import Config from '../scripts/Config';
import StorePouchDB from '../scripts/StorePouchDB';
import StoreRestApi from '../scripts/StoreRestApi';
import Sitemap from '../scripts/Sitemap';
import Queue from '../scripts/Queue';
import ChromePopupBrowser from '../scripts/ChromePopupBrowser';
import Scraper from '../scripts/Scraper';
import getBackgroundScript from '../scripts/BackgroundScript';
3.特殊版本
发现GitHub上下载的版本是第三方修改版本,与官网下载版本不符,格式也不对应,修改此版本任务终止。
三、对0.6.4版本的修改
从0.2.1版本后采用ES6语法重写,之后所有版本都在此基础上迭代,因此我修改了最新版本。ES6语法一开始有些难以理解,因此是模仿着改写。
主要修改部分为background_script.js和content_script.js两个文件。注意devtools_panel.js也包含与background_script.js相同的代码,但修改devtools_panel.js是无效的。
background_script.js的#16095行是SelectorElementScroll部分
function(e, t, i) {
"use strict";
var n = this && this.__awaiter || function(e, t, i, n) {
return new (i || (i = Promise))((function(r, a) {
function o(e) {
try {
c(n.next(e));
} catch (e) {
a(e);
}
}
function s(e) {
try {
c(n.throw(e));
} catch (e) {
a(e);
}
}
function c(e) {
var t;
e.done ? r(e.value) : (t = e.value, t instanceof i ? t : new i((function(e) {
e(t);
}))).then(o, s);
}
c((n = n.apply(e, t || [])).next());
}));
}, r = this && this.__await || function(e) {
return this instanceof r ? (this.v = e, this) : new r(e);
}, a = this && this.__asyncGenerator || function(e, t, i) {
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
var n, a = i.apply(e, t || []), o = [];
return n = {}, s("next"), s("throw"), s("return"), n[Symbol.asyncIterator] = function() {
return this;
}, n;
function s(e) {
a[e] && (n[e] = function(t) {
return new Promise((function(i, n) {
o.push([ e, t, i, n ]) > 1 || c(e, t);
}));
});
}
function c(e, t) {
try {
(i = a[e](t)).value instanceof r ? Promise.resolve(i.value.v).then(l, u) : d(o[0][2], i);
} catch (e) {
d(o[0][3], e);
}
var i;
}
function l(e) {
c("next", e);
}
function u(e) {
c("throw", e);
}
function d(e, t) {
e(t), o.shift(), o.length && c(o[0][0], o[0][1]);
}
};
Object.defineProperty(t, "__esModule", {
value: !0
}), t.SelectorElementScroll = void 0;
const o = i(10);
class s extends o.Selector {
constructor(e) {
super(), this.type = "SelectorElementScroll", this.selector = "", this.multiple = !0,
this.delay = 2e3, this.scrollElementSelector = "", this.updateData(e);
}
canReturnMultipleRecords() {
return !0;
}
canHaveChildSelectors() {
return !0;
}
canCreateNewJobs() {
return !1;
}
willReturnElements() {
return !0;
}
scrollToBottomIm(e) {
return n(this, void 0, void 0, (function*() {
e.element;
this.scrollElementSelector, yield e.scrollDownBodyIm();
}));
}
scrollToBottom(e, t = !1) {
return n(this, void 0, void 0, (function*() {
const i = e.element;
if (this.scrollElementSelector) {
const n = yield e.getElement(this.scrollElementSelector);
if (!n) return;
yield e.scrollDownElement(i, this.selector, t, n.element);
} else yield e.scrollDownBody(i, this.selector, t);
}));
}
scrollToTop(e) {
return n(this, void 0, void 0, (function*() {
e.element;
this.scrollElementSelector, yield e.srcollBodyToTop();
}));
}
_getData(e) {
return a(this, arguments, (function*() {
yield r(this.waitDelay()), yield r(this.scrollToTop(e));
const t = parseInt("" + this.delay, 10) || 0;
let i = yield r(this.getDataElements(e)), n = i.length;
for (;;) {
if (yield r(this.scrollToBottomIm(e)), yield r(e.webPage.waitForPageLoadComplete(!1, t)),
i = yield r(this.getDataElements(e)), i.length === n && (yield r(this.scrollToTop(e)),
yield r(this.scrollToBottom(e, !0)), yield r(e.webPage.waitForPageLoadComplete(!1, t)),
i = yield r(this.getDataElements(e)), i.length === n) && (yield r(this.scrollToTop(e)),
yield r(this.scrollToBottom(e, !0)), yield r(e.webPage.waitForPageLoadComplete(!1, t)),
i = yield r(this.getDataElements(e)), i.length === n)) {
for (const e of i) yield yield r(e);
return yield r(void 0);
}
n = i.length;
}
}));
}
getDataColumns() {
return [];
}
getFeatures() {
return [ "selector", "multiple", "delay", "scrollElementSelector" ];
}
getExperimentalFeatures() {
return [ "scrollElementSelector" ];
}
}
t.SelectorElementScroll = s;
}
注意#50477和#50918行做了函数传递
#50477
// Forwards a body scroll-down request by sending the "scrollDownBody" message
// with [e, t, i] as its arguments and returning whatever sendMessage yields.
// `i` defaults to false. `n` is assumed to be the bundle's __awaiter helper
// (defined outside this excerpt) — confirm.
scrollDownBody(e, t, i = !1) {
return n(this, void 0, void 0, (function*() {
return this.sendMessage("scrollDownBody", [ e, t, i ]);
}));
}
// Sends the "srcollBodyToTop" message (no arguments) and returns the result
// of sendMessage.
// NOTE(review): "srcoll" is a misspelling of "scroll", but the same spelling is
// used by the other srcollBodyToTop methods in this article; presumably the
// receiver dispatches on this string, so rename all of them together or not at all.
srcollBodyToTop() {
return n(this, void 0, void 0, (function*() {
return this.sendMessage("srcollBodyToTop");
}));
}
#50918
// Delegates to this.contentScriptClient.srcollBodyToTop() and waits for it;
// resolves with no value.
srcollBodyToTop() {
return n(this, void 0, void 0, (function*() {
yield this.contentScriptClient.srcollBodyToTop();
}));
}
// Delegates to this.contentScriptClient.scrollDownBodyIm() and waits for it;
// resolves with no value.
// NOTE(review): no scrollDownBodyIm that calls sendMessage is shown next to
// scrollDownBody/srcollBodyToTop above — confirm the client side defines it.
scrollDownBodyIm() {
return n(this, void 0, void 0, (function*() {
yield this.contentScriptClient.scrollDownBodyIm();
}));
}
content_script.js中
#2135
// Looks up the element for reference `e` via this.elementReferences and asks
// s.ScrollDown.scrollElementToTop to scroll it to the top.
// `r` is assumed to be this bundle's __awaiter helper — confirm.
scrollElementToTop(e) {
return r(this, void 0, void 0, (function*() {
const t = this.elementReferences.getElementByReference(e);
yield s.ScrollDown.scrollElementToTop(t);
}));
}
// Scrolls the page itself to the top by passing `window` to
// s.ScrollDown.scrollElementToTop. The "srcoll" spelling matches the name
// used by the callers shown earlier in this article.
srcollBodyToTop() {
return r(this, void 0, void 0, (function*() {
yield s.ScrollDown.scrollElementToTop(window);
}));
}
// Looks up the element for reference `e` via this.elementReferences and passes
// it to s.ScrollDown.scrollElementToBottomIm.
scrollElementToBottomIm(e) {
return r(this, void 0, void 0, (function*() {
const t = this.elementReferences.getElementByReference(e);
yield s.ScrollDown.scrollElementToBottomIm(t);
}));
}
// Page-level variant: passes `window` to s.ScrollDown.scrollElementToBottomIm.
scrollDownBodyIm() {
return r(this, void 0, void 0, (function*() {
yield s.ScrollDown.scrollElementToBottomIm(window);
}));
}
#2856
static scrollElementToTop(e) {
return r(this, void 0, void 0, (function*() {
0 !== o.getElementScrollYPosition(e) && (yield o.scrollToY(0, e));
}));
}
static scrollElementToBottomIm(e) {
return r(this, void 0, void 0, (function*() {
document.documentElement.scrollTop !== o.getElementScrollYPosition(e) && (yield o.scrollToY(0, e));
}));
}
总结
通过自己改写浏览器插件大幅提高工作效率,修改该浏览器插件需要补充JS(ES6)方面知识,ES6的难度较高,不太容易上手。