#coding=utf-8
import re
import urllib
import urllib2
import string
import Queue
import sys
from pybloomfilter import BloomFilter
# Module-level Bloom filter created with capacity=100000 and error_rate=0.0001.
# NOTE(review): nothing in the visible code reads or writes furl1 -- presumably
# it was meant for de-duplicating crawled URLs; confirm, or remove if unused.
furl1=BloomFilter(capacity=100000,error_rate=0.0001)
class Tianmao:
    """Interactive crawler for Tmall (list.tmall.com) product search results.

    ``Dealpage`` prompts for a keyword, downloads the first result page
    (saving it to ``www.html``) and then requests the follow-up pages,
    printing the link and title of every product entry it can find.
    """

    # Search endpoint; Dealpage appends the query string to it.
    _SEARCH_URL = 'http://list.tmall.com/search_product.htm?'

    # One product entry in the result markup: group 1 is the link, group 2
    # the title.  Compiled once instead of being re-specified per page.
    _PRODUCT_RE = re.compile(
        '<p class="productTitle">\r\n\r\n<a href="(.*?)" target="_blank" title="(.*?)"',
        re.S)

    def __init__(self):
        """Create the (currently unused) unbounded URL and title queues."""
        self.urlqueue = Queue.Queue(0)
        self.titqueue = Queue.Queue(0)

    def Getpage(self, url):
        """Fetch *url* with browser-like headers and return the raw body.

        Errors raised while opening or reading the URL (for example
        ``urllib2.HTTPError``) propagate to the caller.
        """
        headers = {
            'Host': 'list.tmall.com',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/35.0',
            'Referer': 'http://list.tmall.com'
        }
        req = urllib2.Request(url, headers=headers)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    @staticmethod
    def _extract_products(page):
        """Return ``[(href, title), ...]`` for every product entry in *page*.

        ``&amp;`` entities in the link are turned back into ``&`` so the
        returned href is directly usable; the title is returned as found.
        """
        return [(href.replace('&amp;', '&'), title)
                for href, title in Tianmao._PRODUCT_RE.findall(page)]

    def _print_products(self, page):
        """Print the link and the UTF-8 re-encoded title of each product."""
        for href, title in self._extract_products(page):
            print(href)
            # Titles are decoded as GBK and re-encoded as UTF-8 for output.
            print(title.decode('gbk').encode('utf-8'))

    #def Write(self,urls,title):
    def Dealpage(self):
        """Prompt for a keyword and print products from the result pages.

        The first page is also written to ``www.html``.  Follow-up pages are
        requested with ``s`` = 60, 120, ... (99 requests); a page whose
        download fails is reported and skipped.
        """
        url1 = self._SEARCH_URL
        key = raw_input('please input key')
        # The keyword is GBK-encoded before URL quoting.
        key = key.decode('utf-8').encode('gbk')
        key = urllib.quote(key)

        url3 = url1 + 'q=' + key + '&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'
        print(url3)
        myPage = self.Getpage(url3)
        # Binary mode: the body is raw bytes and must be stored unmodified.
        with open('www' + '.html', 'wb') as f:
            f.write(myPage)
        self._print_products(myPage)

        for j in range(1, 100):
            # presumably 60 results per page, so s is the result offset
            offset = str(60 * j)
            url = url1 + '&s=' + offset + '&' + 'q=' + key + '&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'
            print(url)
            try:
                myPage = self.Getpage(url)
            except urllib2.HTTPError as e:
                # Report the real status code rather than always '404'.
                print(e.code)
                continue
            except Exception as e:
                print(e)
                continue
            # Only reached on success, so a failed download never causes the
            # previous page to be parsed and printed a second time.
            self._print_products(myPage)
# Script entry: build the crawler and start the interactive search
# (Dealpage prompts for the keyword on stdin and then performs the requests).
tianmao=Tianmao()
tianmao.Dealpage()
import re
import urllib
import urllib2
import string
import Queue
import sys
from pybloomfilter import BloomFilter
# Module-level Bloom filter created with capacity=100000 and error_rate=0.0001.
# NOTE(review): nothing in the visible code reads or writes furl1 -- presumably
# it was meant for de-duplicating crawled URLs; confirm, or remove if unused.
furl1=BloomFilter(capacity=100000,error_rate=0.0001)
class Tianmao:
    """Interactive crawler for Tmall (list.tmall.com) product search results.

    ``Dealpage`` prompts for a keyword, downloads the first result page
    (saving it to ``www.html``) and then requests the follow-up pages,
    printing the link and title of every product entry it can find.
    """

    # Search endpoint; Dealpage appends the query string to it.
    _SEARCH_URL = 'http://list.tmall.com/search_product.htm?'

    # One product entry in the result markup: group 1 is the link, group 2
    # the title.  Compiled once instead of being re-specified per page.
    _PRODUCT_RE = re.compile(
        '<p class="productTitle">\r\n\r\n<a href="(.*?)" target="_blank" title="(.*?)"',
        re.S)

    def __init__(self):
        """Create the (currently unused) unbounded URL and title queues."""
        self.urlqueue = Queue.Queue(0)
        self.titqueue = Queue.Queue(0)

    def Getpage(self, url):
        """Fetch *url* with browser-like headers and return the raw body.

        Errors raised while opening or reading the URL (for example
        ``urllib2.HTTPError``) propagate to the caller.
        """
        headers = {
            'Host': 'list.tmall.com',
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:33.0) Gecko/20100101 Firefox/35.0',
            'Referer': 'http://list.tmall.com'
        }
        req = urllib2.Request(url, headers=headers)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(req)
        try:
            return response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()

    @staticmethod
    def _extract_products(page):
        """Return ``[(href, title), ...]`` for every product entry in *page*.

        ``&amp;`` entities in the link are turned back into ``&`` so the
        returned href is directly usable; the title is returned as found.
        """
        return [(href.replace('&amp;', '&'), title)
                for href, title in Tianmao._PRODUCT_RE.findall(page)]

    def _print_products(self, page):
        """Print the link and the UTF-8 re-encoded title of each product."""
        for href, title in self._extract_products(page):
            print(href)
            # Titles are decoded as GBK and re-encoded as UTF-8 for output.
            print(title.decode('gbk').encode('utf-8'))

    #def Write(self,urls,title):
    def Dealpage(self):
        """Prompt for a keyword and print products from the result pages.

        The first page is also written to ``www.html``.  Follow-up pages are
        requested with ``s`` = 60, 120, ... (99 requests); a page whose
        download fails is reported and skipped.
        """
        url1 = self._SEARCH_URL
        key = raw_input('please input key')
        # The keyword is GBK-encoded before URL quoting.
        key = key.decode('utf-8').encode('gbk')
        key = urllib.quote(key)

        url3 = url1 + 'q=' + key + '&sort=s&style=g&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'
        print(url3)
        myPage = self.Getpage(url3)
        # Binary mode: the body is raw bytes and must be stored unmodified.
        with open('www' + '.html', 'wb') as f:
            f.write(myPage)
        self._print_products(myPage)

        for j in range(1, 100):
            # presumably 60 results per page, so s is the result offset
            offset = str(60 * j)
            url = url1 + '&s=' + offset + '&' + 'q=' + key + '&from=mallfp..pc_1_searchbutton&type=pc#J_Filter'
            print(url)
            try:
                myPage = self.Getpage(url)
            except urllib2.HTTPError as e:
                # Report the real status code rather than always '404'.
                print(e.code)
                continue
            except Exception as e:
                print(e)
                continue
            # Only reached on success, so a failed download never causes the
            # previous page to be parsed and printed a second time.
            self._print_products(myPage)
# Script entry: build the crawler and start the interactive search
# (Dealpage prompts for the keyword on stdin and then performs the requests).
# NOTE(review): the imports, the Tianmao class and these two statements also
# appear verbatim earlier in this file, so running the file performs the
# whole prompt-and-crawl sequence twice -- confirm whether the duplicated
# half is intentional or a paste artefact.
tianmao=Tianmao()
tianmao.Dealpage()