首先,发表宇宙安全声明:本人是初学者,只是跟着教程做实例,爬取信息量很小(仅有两页),且不会用于商业用途,对网站无任何恶意,只是希望了解现象背后的原理和解决方案,所以请尽量避免所问问题之外的讨论!
好的,声明完成,下面进入正题:
题主在mooc学爬虫,BIT的课程中有一个爬取淘宝商品价格的实例,跟着做了,但是遇到一些问题。
BIT的这个课程应该比较旧,课程里淘宝搜索页面的HTML文件和现在已经完全不同了:那个HTML文件里面是有每个商品的简要信息的,但是现在进入淘宝,右击-查看源代码,是这样的:(为了方便大佬解答,复制了整个页面的源代码,可能影响阅读体验,请见谅 😁)
<!DOCTYPE html><html><head><script>window.__alp_abtest_bucket__='{"shunt":"4180_5_1"}'</script><script src="//g.alicdn.com/mtb/wpk/1.3.1/wpk.js"></script><script>try{getLego2WPK({"bid":"alpvis","env":"","rate":0.1})}catch(e){window.getLego2WPK=function(){return {report:noop,reportFlow:noop,reportError:noop,reportApiError:noop,setConfig:noop,install:noop,installAll:noop,uninstall:noop,diagnose:noop};function noop(){}}}</script><meta name="spm-id" content="a2e1u"><title>爱淘宝PC搜索</title><meta charset="utf-8"><meta name="aplus-waiting" content="MAN"><link rel="stylesheet" type="text/css" href="//g.alicdn.com/thx/cube/1.1.0/cube-min.css"><link rel="stylesheet" type="text/css" href="//lego.alicdn.com/mm/lego2??cell-pc-frame/0.0.14/index.css,pc-keyword-list/0.0.2/index.css,pc-search-list/0.0.6/index.css,pc-search/0.0.15/index.css"><script>window._cellPageData_ = {
mtop: {
trackConfig: {
"mtop.alimama.union.xt.en.api.entry": "variableMap.union_lens,variableMap.recoveryId:getRecoveryId()"
}
}
}</script><script src="https://g.alicdn.com/??cell/lib-zepto/0.0.2/index.js,cell/lib-rem/0.0.5/index.js,cell/lib-seajs/0.0.5/index.js,cell/lib-magix/0.0.17/index.js,cell/lib-util/0.2.11/index.js,mtb/lib-windvane/2.1.8/windvane.js,mtb/lib-env/1.9.10/env.js,mtb/lib-login/1.5.10/login.js,mtb/lib-httpurl/1.3.2/httpurl.js,mtb/lib-promise/3.1.3/polyfillB.js,mtb/lib-mtop/2.5.1/mtop.js,cell/lib-pagestate/0.0.3/index.js,cell/lib-mtop/1.0.4/index.js,cell/lib-mtop/1.0.4/errorlog.js,cell/lib-mtop/1.0.4/taishan.js,cell/lib-mtop/1.0.4/link.js,cell/lib-mtop/1.0.4/datainject.js,cell/lib-mtop/1.0.4/optimus.js,dt/tracker/3.3.0/tracker.Tracker.js,dt/tracker/3.3.0/tracker.performanceTrackerPlugin.js,cell/lib-track/0.1.0/index.js,cell/cellex-baseItem/0.0.3/seajs/index.js,cell/cellex-etao-lazyload/0.0.7/index.js,cell/cellex-etao-lazyload/0.0.7/lazyInit.js,cell/lib-img/0.0.7/index.js"></script><style>body,
button,
input,
select,
textarea {
font-family: arial, Helvetica, sans-serif !important;
}</style></head><body style="background-color:#fff;" data-spm="19484427"><script>window.pvid="201_33.53.204.179_142360_1689422516899";window.bucket_info="1";with(document)with(body)with(insertBefore(createElement("script"),firstChild))setAttribute("exparams","category=&userid=&aplus&pvid=201_33.53.204.179_142360_1689422516899&bucket_info=1&req_url=" + encodeURIComponent(location.href) + (typeof __alp_abtest_bucket__ === "string" ? "&lens_extinfo=" + encodeURIComponent(__alp_abtest_bucket__) : ""),id="tb-beacon-aplus",src="//g.alicdn.com/alilog/mlog/aplus_v2.js")</script><!-- lego2 import from /1/alp/atb/block_tb_head.html --><script src="//g.alicdn.com/??kissy/k/1.3.1/kissy-min.js"></script>
<link rel="stylesheet" href="//g.alicdn.com/tb/global/3.4.17/global-min.css?t=20150525">
<script src="//g.alicdn.com/tb/global/3.4.17/global-min.js?t=20140417"></script>
<div id="J_SiteNav" class="site-nav">
<div id="J_SiteNavBd" class="site-nav-bd">
<ul id="J_SiteNavBdL" class="site-nav-bd-l"></ul>
<ul id="J_SiteNavBdR" class="site-nav-bd-r"></ul>
</div>
</div>
<script>
// Start Taobao's global script only if it has defined window.TB.
if (window.TB) {
// 'fn-cart' is put on the blacklist before init() runs
// (presumably this disables the cart widget in the site nav — confirm against global-min.js).
TB.Global.blacklist = ['fn-cart'];
TB.Global.init();
}
// Global flag, hard-coded to false; nothing in the visible page source reads it.
var openNianhuo = false;
</script>
<div id="J_u_root"><div bx-container="true" class="main-container" data-spm="d_frame" mx-view="mv/frame"><div mx-view="pc-search" class="pc-search" data-config="","url":"https://ai.taobao.com/search/index.htm?pid=mm_0_0_0&key=%E4%B9%90%E9%AB%98","isRed":false},{"text":"养生壶","url":"https://ai.taobao.com/search/index.htm?pid=mm_0_0_0&key=%E5%85%BB%E7%94%9F%E5%A3%B6","isRed":false}],"ad":{"img":"https://ossgw.alicdn.com/alp/1599430879914-227-79.png","link":"javascript:void(0)"}}" bx-name="pc-search" bx-version="0.0.15" bx-guid="lego1" hasjs="true" hascss="true" bx-slot="slot-1" data-spm="29996459"><img src="https://ossgw.alicdn.com/alp/1572809375526-200-76.png"><div class="search"><div class="search-border clearfix"><form mx-submit="submit()" id="J_searchForm" class="ks-combobox ks-combobox-shown"><span class="text-wrap"><input type="text" class="text" id="J_search_key" name="key" accesskey="s" autocomplete="off" x-webkit-speech x-webkit-grammar="builtin:translate" tabindex="0" aria-expanded="true" hidefocus="true"></span><input mx-click="submit()" type="submit" class="submit" value="搜索" index="1"></form><div class="sugCon"><div id="J_sugCon"></div><div id="J_sugGroup" class="sugTagGroup"></div></div></div><div class="search-links" id="J_searchLink" trace="hotsearch" data-spm="1998559106"></div></div><a target="_blank" class="ad" href="javascript:void(0)"><img src="https://ossgw.alicdn.com/alp/1599430879914-227-79.png"></a></div><script type="text/tmpl" id="searchLinkTmpl"><% if(firstKey&&firstKey.url){%>
<a
href="<%= firstKey.url%>"
class="<% if(firstKey.isRed){%>red<%}%>"
target="_blank"
><%= firstKey.text%></a>
<% }%>
<% for(var i = 0; i < keyWords.length; i++){%>
<a
href="<%= keyWords[i].url%>"
class="<% if(keyWords[i].isRed){%>red<%}%>"
target="_blank"
><%= keyWords[i].text%></a>
<% }%></script><script type="text/tmpl" id="sugListTmpl"><ul class="sug-list">
<% for(var i = 0, j = list.length; i < j; i++){%>
<li id="J_sug_item_<%=i%>" class="sug-item" mx-mouseover="sugOver({index: <%=i%>, hasGroup: <%=list[i].hasGroup%>})" mx-click="sugSearch({key:'<%= list[i].q%>'})">
<div class="sug-item-title">
<%!list[i].name%>
<%if(list[i].hasGroup){%>
<span class="atbfont">󰅭</span>
<%}%>
</div>
</li>
<% }%>
</ul></script><script type="text/tmpl" id="sugGroupTmpl"><% for(var i = 0, j = groupList.length; i < j; i++){%>
<div id="J_tag_group_<%=groupList[i].index-1%>" class="search-tags-group">
<h1 class="right-title"><%!groupList[i].name%></h1>
<% for(var k = 0, l = groupList[i].data.length; k < l; k++){ %>
<ul class="search-tags-list clearfix">
<% for(var m = 0, n = groupList[i].data[k].length; m < n; m++){ %>
<li>
<a href="#" class="<%if(groupList[i].data[k][m].type=='hot'){%>hot<%}%>" mx-click="sugGroupSearch({key:'<%= groupList[i].q+ ' ' + groupList[i].data[k][m].title%>'})"><%=groupList[i].data[k][m].title%></a>
</li>
<%}%>
</ul>
<%}%>
</div>
<% }%></script><div class="lego-pc-search-list pc-search-list" mx-view="pc-search-list" data-config="{"searchKey":"key","defaultKey":"连衣裙","pSize":"60","floorId":"qie,34374,爱淘宝PC切流默认排序","mainTagImg":"https://gw.alicdn.com/tfs/TB1caflgebviK0jSZFNXXaApXXa-32-16.png","mainTagAuction":"281666","tags":[{"auction":"3394","img":"https://gw.alicdn.com/tfs/TB1D95ThFP7gK0jSZFjXXc5aXXa-95-36.png"},{"auction":"513","img":"https://img.alicdn.com/tfs/TB1B0PgkWL7gK0jSZFBXXXZZpXa-117-36.png"},{"auction":"71682","img":"https://gw.alicdn.com/tfs/TB1R6hSg9f2gK0jSZFPXXXsopXa-50-16.png"},{"auction":"4738","img":"https://gw.alicdn.com/tfs/TB1.EpVg4D1gK0jSZFKXXcJrVXa-16-16.png"},{"auction":"79810","img":"https://gw.alicdn.com/tfs/TB1R24Wg7T2gK0jSZFkXXcIQFXa-32-16.png"}],"maxTagNum":"2","needNavigator":true,"isNeedSort":true,"isNeedUserType":true,"aaa":1}" bx-name="pc-search-list" bx-version="0.0.6" bx-guid="lego2" hasjs="true" hascss="true" bx-slot="slot-2" data-spm="29996460"></div><div class="lego-pc-keyword-list pc-keyword-list" mx-view="pc-keyword-list" data-config="{"searchKey":"key"}" bx-name="pc-keyword-list" bx-version="0.0.2" bx-guid="lego3" hasjs="true" hascss="true" bx-slot="slot-3" data-spm="29996461"></div></div><style>#alimama-footer {
background: #fff;
}
.alimama-info,
.aligroup-info,
.ualificate-info {
max-width: 1200px;
margin: 0 auto;
}</style><!-- lego2 import from /1/alp/union/mm/footer/taobao.html --><div id="alimama-footer" data-spm="a214tr9">
<div class="alimama-info">
<div class="link">
<a target="_black" href="//help.alimama.com/">联系客服</a>
<b>|</b>
<a target="_black" href="//open.taobao.com/">开放平台</a>
<b>|</b>
<a target="_black" href="https://terms.alicdn.com/legal-agreement/terms/suit_bu1_ali_mama_division/suit_bu1_ali_mama_division201709111812_13128.html">法律声明</a>
<b>|</b>
<a target="_black" href="//jubao.alibaba.com">廉正举报</a>
</div>
<div class="copyright">
<span>Taobao.com版权所有 2003-现在</span>
<b>|</b>
<a target="_black" href="http://beian.miit.gov.cn">增值电信业务经营许可证:浙B2-20070195</a>
<a rel="noopener noreferrer" target="_blank" href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=33010002000075"><img src="//gw.alicdn.com/tfs/TB1jwakrbH1gK0jSZFwXXc7aXXa-20-20.png" style="vertical-align: -2px;margin-right: 4px;">浙公网安备 33010002000075号</a>
</div>
</div>
<div class="aligroup-info">
<a target="_black" href="http://www.alibabagroup.com/cn/global/home">阿里巴巴集团
<div class="ualificate-info">
<a target="_black" href="http://www.zjjubao.com/?spm=a21bo.jianhua.1997523009.44.5af911d9Vb95w7">
<img align="absmiddle" src="//img.alicdn.com/tps/i2/T1C3z7FudfXXcsE9Te-40-42.png" alt="网监局">
</a>
<a target="_black" href="http://idinfo.zjamr.zj.gov.cn/bscx.do?method=lzxx&id=3301843301002003002431">
<img align="absmiddle" width="40" src="//gw.alicdn.com/tfs/TB1HxCbreL2gK0jSZPhXXahvXXa-65-70.gif">
</a>
<a target="_black" href="https://www.12315.cn/?spm=a21bo.jianhua.1997523009.43.5af911d9Vb95w7">
<img align="absmiddle" alt="网监局" src="//img.alicdn.com/tps/i4/T1VVv9FABeXXbtCInf-38-42.png">
</a>
</div>
</div>
<style>
.index-nav-map-wrap {
height: auto !important;
}
#alimama-footer {
padding: 30px 10px;
text-align: left;
font-size: 12px;
line-height: 2.4;
font-family: 'helvetica neue',tahoma,'hiragino sans gb',stheiti,'wenquanyi micro hei',\\5FAE\\8F6F\\96C5\\9ED1,\\5B8B\\4F53,sans-serif;
}
#alimama-footer a,
#alimama-footer span {
color: #333;
filter:alpha(opacity=50);
filter:none\9;
opacity: 0.5;
}
#alimama-footer a:hover {
filter:alpha(opacity=100);
filter:none\9;
opacity: 1;
}
#alimama-footer .alimama-info .link,
#alimama-footer .alimama-info .copyright {
display: inline-block;
}
#alimama-footer .alimama-info a,
#alimama-footer .alimama-info span {
margin-right: 20px;
}
#alimama-footer .alimama-info b {
display: none;
font-weight: 400;
color: #DDD;
}
#alimama-footer .aligroup-info b {
margin: 0 8px;
font-weight: 400;
color: #DDD;
}
.ualificate-info {
margin-top: 8px;
}
.ualificate-info a {
margin-right: 20px;
}
.ualificate-info a img {
border: none;
-webkit-filter: grayscale(100%);
-moz-filter: grayscale(100%);
-ms-filter: grayscale(100%);
-o-filter: grayscale(100%);
filter: grayscale(100%);
filter: gray;
}
.ualificate-info a:hover img {
-webkit-filter: grayscale(0%);
-moz-filter: grayscale(0%);
-ms-filter: grayscale(0%);
-o-filter: grayscale(0%);
filter: grayscale(0%);
filter: none;
}
@media screen and (max-width:767px) {
#alimama-footer {
padding: 30px 12px;
line-height: 2;
}
.aligroup-info {
display: none;
}
.ualificate-info {
display: none;
}
#alimama-footer .alimama-info .link {
display: -webkit-flex;
display: flex;
-webkit-justify-content: space-between;
justify-content: space-between;
}
#alimama-footer .alimama-info .copyright {
display: block;
}
#alimama-footer .alimama-info .copyright a,
#alimama-footer .alimama-info .copyright span {
display: block;
text-align: center;
}
#alimama-footer .alimama-info .copyright b {
display: none;
}
#alimama-footer .alimama-info a,
#alimama-footer .alimama-info span {
margin-right: 0;
}
#alimama-footer .alimama-info b {
display: inline-block;
}
}
</style><script type="text/javascript" src="//lego.alicdn.com/mm/lego2??cell-pc-frame/0.0.14/index.js,pc-keyword-list/0.0.2/index.js,pc-search-list/0.0.6/index.js,pc-search/0.0.15/index.js"></script><script>! function () {
// Immediately-invoked setup that (1) hooks Glazy lazy loading into the page and
// (2) wraps every Magix view's render() so that lazy checks run and an
// 'after-render' event fires after each render.
// Relies on globals defined by other scripts: $, Magix and (optionally) Glazy.

// Global list of views to be swapped in by Glazy after the main container renders.
window.fsViews = [];
// Becomes true once the '.main-container' after-render handler has run.
var fsViewRendered = false;
// View range info captured once at startup (only when Glazy is present).
var firstViewRangeInfo;
// Registers the one-off wiring that swaps the queued views after the main
// container has rendered. Does nothing inside the editor.
var swapfsView = function () {
if (!window._IS_IN_EDITOR_) {
// Lazy-rendered component check; components are not lazy-rendered in the visual editor.
$('.main-container').on('after-render', function () {
fsViews.forEach(function (view, index) {
Glazy.swap(view);
});
fsViewRendered = true;
// Collect the asynchronous components that are outside the first screen.
window.Glazy.collect(null, '[data-mxview]');
});
}
};
if (window.Glazy) {
// Collect lazy-loaded images (elements carrying data-src) and check them right away.
window.Glazy.collect(null, '[data-src]').check();
// NOTE(review): redeclares the `var` from above; harmless because `var` is
// function-scoped, so this is just an assignment to the same variable.
var firstViewRangeInfo = window.Glazy.getViewRangeInfo();
// Collect the lazy-rendered components of the first screen.
// (swapfsView repeats this same editor check internally.)
if (!window._IS_IN_EDITOR_) {
swapfsView();
}
}
// after render
// Monkey-patch Magix.View.extend: any view definition that has a render()
// gets that render() wrapped; the original extend is then called unchanged.
var oldExtend = Magix.View.extend;
Magix.View.extend = function (props) {
if (props && props.render) {
var oldRender = props.render;
props.render = function () {
var $el = $(this.$el);
var vf = Magix.Vframe.get(this.id);
// NOTE(review): throws if Magix.Vframe.get returns nothing for this id — confirm it cannot.
var $v = vf.$v || {};
// Fall back to looking the element up by the view's id.
if ($el.length < 1) {
$el = $('#' + this.id);
}
var viewName = $el.attr('mx-view') || $el.attr('mx-view-rendered');
// Lazy handling applies only when Glazy exists and the view name does not start with 'cell'.
var mayNeedLazy = window.Glazy && viewName && viewName.indexOf('cell') !== 0;
// viewRangeInfo is hoisted (`var`); it stays undefined when the check is skipped.
if (mayNeedLazy && !$v.skipCheck) {
// Use the startup snapshot until the main container has rendered, then fresh info.
var viewRangeInfo = fsViewRendered ? window.Glazy.getViewRangeInfo() :
firstViewRangeInfo;
}
// Run the view's own render with the same `this` and arguments.
oldRender.apply(this, arguments);
if (mayNeedLazy) {
// Defer the Glazy check/collect to the next animation frame.
window.requestAnimationFrame(function () {
!$v.skipCheck && window.Glazy.check($el, viewRangeInfo);
!$v.skipCollect && window.Glazy.collect($el);
});
}
// Fired after every wrapped render, whether or not lazy handling applied.
$el.triggerHandler('after-render');
};
}
return oldExtend.apply(this, arguments);
};
}();
// Configure Magix with 'J_u_root' as its root id (the page has an element with that id), then boot it.
Magix.config({
rootId: 'J_u_root'
});
Magix.boot();
// Tracking hooks are optional: each one is called only if it exists on window.
// `"" || '...'` always evaluates to the right-hand string, because "" is falsy.
window.initTrack && window.initTrack("" ||
'pid:mm_10011550_0_0,relationId,union_lens:union_lens(),unid,clk1');
// Sends the union_lens value obtained from Util.getSearchObj().
window.sendPV && window.sendPV({
union_lens: Util.getSearchObj().union_lens
});</script></div></body></html>
这是搜索某一商品后的页面源代码,可以看到这个HTML中没有相关商品的名称、价格等信息。但是在同一个页面上右击并点击“检查”,部分区域的内容是这样的:
<span class="title-text">FILA FUSION斐乐潮牌背包情侣2023秋双肩包<span class="H">书包</span>电脑包男士</span>
<span class="coupon-price-title">¥</span><span class="coupon-price-afterCoupon">549.00</span><span class="coupon-price-old">¥580.00</span>
又可以看到,在“检查”模式下,是有题主需要的信息的。
题主出于好奇,又去翻了京东的搜索页面,同样进行“查看源代码”操作,情况是这样的(片段截取):
<div class="p-price">
<strong class="J_10059966670473" data-presale="0" data-done="1" >
<em>¥</em><i data-price="10059966670473">79.00</i>
</strong>
</div>
<div class="p-name p-name-type-2">
<a target="_blank" title=""
href="//item.jd.com/10059966670473.html"
onclick="searchlog(1, '10059966670473','1','1','','flagsClk=2097575');">
<em>回力双肩包男士背包大容量时尚休闲旅行包电脑包高中大学生初中<font class="skcolor_ljg">书包</font>女 黑色随机挂件</em>
和淘宝不同的是,京东的HTML源代码中是能直接找到题主需要的信息的。题主对这两种情况的差异非常好奇,希望蹲个大佬解答以下两个问题:
- 淘宝的页面是怎么组织的,为什么“检查”操作和“查看源代码”操作得到的HTML内容完全不同?
- 对于淘宝的这种页面组织方式,是否有办法可以获取到题主想要的信息?
欢迎大家的讨论,先行谢过大佬花费时间的解答🙏