第一步:点击上图中“网络”标签,然后刷新或载入页面
第二步:在右侧“标头”选项卡下方,“请求标头”中的所有字段就是 headers 的内容,将其作为 headers 参数添加到请求中即可(例如 requests 的 headers=,或下文 urllib2.Request 的 headers=)
import urllib2,os
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """HTML parser that downloads matching images as it parses.

    For every ``<img>`` start tag, each ``src`` attribute whose value starts
    with ``http`` and contains the character ``x`` is handed to
    ``down_image`` and saved under the last ``/``-separated segment of the
    URL.
    """

    def __init__(self):
        HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Download the image referenced by an ``<img>`` tag's ``src``.

        Args:
            tag: Tag name as reported by the parser.
            attrs: List of ``(name, value)`` pairs for the tag; ``value`` is
                ``None`` for an attribute written without a value
                (e.g. ``<img src>``).
        """
        if tag != "img":
            return
        for name, value in attrs:
            if name != "src" or not value:
                # A missing or empty src has nothing to download; slicing a
                # None value here would raise TypeError.
                continue
            # NOTE(review): the 'x' filter presumably targets size-suffixed
            # image URLs (e.g. "220x220") -- confirm the intent.
            if value.startswith('http') and 'x' in value:
                pic_name = value.split('/')[-1]
                print(pic_name)
                down_image(value, pic_name)
def down_image(url, file_name):
    """Fetch ``url`` and write the response body to ``file_name``.

    The request is sent with the module-level ``headers`` dict, so that name
    must be defined at module scope before this function is called.

    Args:
        url: Address of the resource to download.
        file_name: Path of the file to create or overwrite; it is opened in
            binary mode.
    """
    # ``headers`` is only read, so no ``global`` declaration is needed.
    req = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        binary_data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()
    # The context manager closes the file even if the write raises.
    with open(file_name, 'wb') as temp_file:
        temp_file.write(binary_data)
#
#
if __name__ == "__main__":
    # Script entry point: create the image directory and make it the working
    # directory, then fetch each listing page and feed it to MyHTMLParser,
    # which downloads matching images into the current directory.
    img_dir = "/home/ubuntu/mypapercode/workpiece"
    if not os.path.isdir(img_dir):
        # makedirs also creates any missing parent directories.
        os.makedirs(img_dir)
    os.chdir(img_dir)
    print(os.getcwd())
    # Module-level on purpose: down_image() reads ``headers`` as a global.
    headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    # Not referenced by the visible code; kept in case later code uses it.
    all_links = []
    hp = MyHTMLParser()
    for i in range(1, 30):
        # NOTE(review): this appends the page number and '/' directly after
        # ".html" (e.g. "...html1/"); confirm this is the site's real
        # pagination URL format.
        url = 'https://www.1688.com/pic/-B9A4BCFECDBCC6AC.html' + str(i) + '/'
        req = urllib2.Request(url=url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            content = response.read()
        finally:
            # Release the connection before moving on to the next page.
            response.close()
        hp.feed(content)
    # Flush any buffered input once all pages have been fed.
    hp.close()