简单的模拟天猫搜索并遍历一百页商品（会遇到反爬机制）

最新推荐文章于 2024-03-25 10:08:44 发布

我写爬虫

最新推荐文章于 2024-03-25 10:08:44 发布

阅读量1.5k

点赞数

文章标签： python 爬虫 天猫 数据挖掘 ubuntu

本文链接：https://blog.csdn.net/u012328712/article/details/44062901

版权

#coding=utf-8
import re
import urllib
import urllib2
import string
import Queue
import sys
from pybloomfilter import BloomFilter
# Module-level Bloom filter created with capacity 100000 and error rate 0.0001.
# NOTE(review): nothing in the visible code reads or updates it; presumably it
# was meant for de-duplicating crawled URLs -- confirm before relying on it.
furl1 = BloomFilter(
    capacity=100000,
    error_rate=0.0001,
)
class Tianmao:
    """Search list.tmall.com for a keyword and print product links and titles.

    Written for Python 2: it relies on ``urllib2``, ``Queue`` and ``raw_input``.
    """

    # Search endpoint; query parameters are appended directly to this string.
    SEARCH_URL = 'http://list.tmall.com/search_product.htm?'
    # The ``s`` query parameter grows by this much per page
    # (presumably the result offset -- confirm against the site).
    PAGE_STEP = 60
    # Number of result pages requested in total, including the first one.
    PAGE_COUNT = 100
    # Captures (href, title) of each product entry. The pattern expects two
    # CRLF line breaks between the <p> tag and the <a> tag, as in the
    # original code.
    PRODUCT_RE = re.compile(
        '<p class="productTitle">\r\n\r\n<a href="(.*?)" target="_blank" title="(.*?)"',
        re.S)

    def __init__(self):
        # Unbounded queues (maxsize 0). They are created here but not used by
        # any method of this class.
        self.urlqueue = Queue.Queue(0)
        self.titqueue = Queue.Queue(0)

    def Getpage(self, url):
        """Fetch *url* and return the raw response body.

        The request always carries a ``Host: list.tmall.com`` header, a
        Firefox User-Agent and a Tmall Referer, and is sent through an opener
        with cookie handling enabled.

        Raises whatever ``urllib2`` raises on failure (for example
        ``urllib2.HTTPError`` or ``urllib2.URLError``).
        """
        headers = {
            'Host': 'list.tmall.com',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/35.0',
            'Referer': 'http://list.tmall.com',
        }
        req = urllib2.Request(url, headers=headers)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(req)
        try:
            return response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()

    def _print_products(self, page):
        """Print the link and the UTF-8 encoded title of each product in *page*."""
        for href, title in self.PRODUCT_RE.findall(page):
            # Drop every 'amp;' so an HTML-escaped '&amp;' becomes a plain '&'.
            print(href.replace('amp;', ''))
            # Titles are treated as GBK bytes and re-encoded as UTF-8.
            print(title.decode('gbk').encode('utf-8'))

    def Dealpage(self):
        """Prompt for a keyword, then crawl and print the search result pages.

        The first result page is also saved to ``www.html`` in the current
        directory. A failure while fetching the first page propagates to the
        caller; a failure on any later page is reported and that page is
        skipped.
        """
        key = raw_input('please input key')
        # Assumes the keyboard input is UTF-8; the query is sent GBK-encoded
        # and percent-quoted.
        key = urllib.quote(key.decode('utf-8').encode('gbk'))

        first_url = (self.SEARCH_URL + 'q=' + key +
                     '&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter')
        print(first_url)
        page = self.Getpage(first_url)
        with open('www.html', 'w+') as dump:
            dump.write(page)
        self._print_products(page)

        for index in range(1, self.PAGE_COUNT):
            url = (self.SEARCH_URL + '&s=' + str(self.PAGE_STEP * index) +
                   '&q=' + key +
                   '&from=mallfp..pc_1_searchbutton&type=pc#J_Filter')
            print(url)
            try:
                page = self.Getpage(url)
            except urllib2.HTTPError as err:
                # Report the real status code, then skip this page rather
                # than re-parsing the previously fetched one.
                print('HTTP error %d' % err.code)
                continue
            except Exception as err:
                print(err)
                continue
            self._print_products(page)
if __name__ == '__main__':
    # Run the crawl only when executed as a script, so that importing this
    # module neither prompts for input nor issues network requests.
    tianmao = Tianmao()
    tianmao.Dealpage()