抓取蘑菇街商品数据主要用于研究,不用于商业用途。
所以,抓取思路如下:
1.获取分类列表页面,如 洁面
2.查看这个页面的源代码会发现,主要的数据请求都是通过异步Ajax实现的,上面的页面只是一个空的框架壳子,所以我们需要找到Ajax请求地址
3.
4.到页面源代码中找到生成这个新请求地址的代码
<!-- ajax 参数 param-name里填参数名 value填参数值-->
<input type="hidden" class="ajax_url" value="http://list.mogujie.com/search" />
<input type="hidden" class="ajax_param" param-name="action" value="magic" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="currentId" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="preId" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="cKey" value="pc-wall-v1" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="fcid" value="51899" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="page" value="1" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="sort" value="pop" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="fid" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="minPrice" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="maxPrice" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="itemMarks" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="userMarks" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="title" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="f" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="ad" value="2" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="stitle" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="tag" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="book" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="section" value="0" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="cpc_offset" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="showW" value="220" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="showH" value="330" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="width" value="220" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="height" value="330" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="ptpPartC" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="_mgjuuid" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="is_admin" value="" autocomplete="off"/>
<input type="hidden" class="ajax_param" param-name="userId" value="" autocomplete="off"/>
5.很好,我们只需要利用这些hidden元素生成url就行了
# Regex matching the page's hidden <input class="ajax_param"> elements.
# Group 1 captures the param-name attribute, group 2 the value attribute
# (both non-greedy, so an empty value="" yields an empty string).
self.re_ajax_params = r"<input type=\"hidden\" class=\"ajax_param\" param-name=\"(.*?)\" value=\"(.*?)\""
def get_ajax_url(self, content, item):
    """Assemble the list-search Ajax URL from a page's hidden inputs.

    ``content`` is scanned with ``self.re_ajax_params`` through the
    ``get_re_findall`` helper; each match contributes one ``name=value``
    pair, and the pairs are joined with ``&`` behind the fixed
    ``http://list.mogujie.com/search?`` prefix.  Values are concatenated
    as-is, without URL-encoding.  ``item`` is not used by the lines shown.

    NOTE(review): this excerpt stops after building the URL and contains no
    ``return`` statement, even though the caller assigns the result -- the
    tail of the method was presumably dropped from the write-up; confirm
    against the full source.
    """
    matches = get_re_findall(content, self.re_ajax_params)
    query_parts = []
    for match in matches:
        # match[0] is the parameter name, match[1] its (possibly empty) value.
        query_parts.append(match[0] + "=" + match[1])
    url = "http://list.mogujie.com/search?" + "&".join(query_parts)
6.于是剩下的事情就很简单了,直接利用生成的url抓取json就行了。
代码片段
# Tornado coroutine: fetch item[0] and dispatch on the item's type flags.
#
# item is indexed as item[0] (URL to fetch), item[1] (0 = HTML shell page,
# 1 = JSON response) and item[2] (1 = first JSON page).  proxy_ip, when
# given, is split on ":" into host and port.
#
# Result (delivered via gen.Return): (True, urls) on success, or (False, [])
# when any Exception is raised inside the try block.
#
# NOTE(review): this excerpt lost its indentation, so the nesting has to be
# restored before the code can run; it is also Python 2 (`print urls`,
# `unicode`).
@gen.coroutine
def get_links_from_url(self, item, proxy_ip=None):
try:
# GET request with a 300-second timeout.
request = httpclient.HTTPRequest(url=item[0], method="GET", request_timeout=300)
if proxy_ip is not None:
request.proxy_host, request.proxy_port = proxy_ip.split(":")
# split() yields strings, so the port is converted to int here.
if not isinstance(request.proxy_port, int): request.proxy_port = int(request.proxy_port)
# Select the curl-based client implementation, then fetch asynchronously.
curl_httpclient.AsyncHTTPClient.configure(curl_httpclient.CurlAsyncHTTPClient)
response = yield curl_httpclient.AsyncHTTPClient().fetch(request)
# Use the body as-is if it is already unicode; otherwise decode it as UTF-8.
html = response.body if isinstance(response.body, unicode) else response.body.decode("utf-8")
urls = []
#html
# HTML shell page: derive the Ajax URL(s) from its hidden inputs.
if item[1] == 0:
urls = self.get_ajax_url(html, item)
elif item[1] == 1: # json
if item[2] == 1: # first page
# First JSON page: collect the follow-up pages and queue each of them
# in the "fetching" table, then parse this page's own data.
urls = self.parse_next_page(html)
for url in urls:
# NOTE(review): each entry is indexed with [1] and [2] while the whole
# entry is stored under "url" -- presumably a (url, p1, p2) record;
# confirm against parse_next_page.
self.db.insertItemWithKey(self.list_fetching_table,{"url":url}, {"p1":url[1],"p2":url[2]})
self.parse_json_obj(html)
else: # page 1..N
self.parse_json_obj(html)
print urls
print('fetched %s' % (item[0]))
# Record the URL as fetched (with a timestamp) and drop it from the
# "fetching" table.
self.db.insertItemWithKey(self.list_fetched_table, {"url": item[0]},
{"url": item[0], "update_time": get_current_timestamp()})
self.db.remove(self.list_fetching_table, {"url": item[0]})
except Exception as e:
# Any failure is printed and reported to the caller as (False, []).
print('Exception: %s %s' % (e, item[0]))
raise gen.Return((False,[]))
# Success: hand back the URLs discovered on this page.
raise gen.Return((True,urls))