1. 淘宝商品信息定向爬虫
2. 实例编写
2.1 整体框架
# -*- coding: utf-8 -*-
"""Skeleton of a targeted crawler for Taobao product search results.

The three helpers below are placeholders that only print an empty line;
``main`` wires them together into the page-by-page crawl loop.
"""
import re

import requests


def getHTMLText(url):
    """Placeholder for the page fetcher.

    Args:
        url: Address of the search-result page to retrieve.

    Returns:
        None for now: the stub only prints an empty line.
    """
    print("")


def parsePage(ilt, html):
    """Placeholder for the parser that is run on every fetched page.

    Args:
        ilt: List that collects the extracted product records.
        html: Page content as returned by ``getHTMLText``.
    """
    print("")


def printGoodsList(ilt):
    """Placeholder for the routine that prints the product information.

    Args:
        ilt: List of product records gathered by ``parsePage``.
    """
    print("")


def main():
    """Crawl ``depth`` result pages for a fixed keyword and print the results."""
    goods = '书包'
    depth = 2
    start_url = 'http://s.taobao.com/search?q=' + goods
    infoList = []  # accumulated output records

    for i in range(depth):
        try:
            # The `s` offset advances in multiples of 44 per page
            # (0 for the first page).
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a failing page is skipped and the crawl moves on.
            # `Exception` (not a bare `except`) keeps Ctrl-C and SystemExit
            # able to stop the program.
            continue

    printGoodsList(infoList)


if __name__ == '__main__':
    main()
2.2 获取HTML
def getHTMLText(url): # print("") try: coo = 'cna=tdBCFfDBNAMCAd9okXkZ1GL3; miid=112621671462202524; t=44589a73c162d6acda521ff61a2b0495; tracknick=%5Cu90AA%5Cu8272%5Cu73AB%5Cu7470; tg=0; thw=cn; cookie2=118b55b0763bd8b114d620eea8d4aad6; v=0; _tb_token_=e63a37eb038d7; _samesite_flag_=true; _m_h5_tk=bd2dfdb57cb705d14afac8d51692b104_1580480469597; _m_h5_tk_enc=75684835cca4377aa7705414649de248; hng=CN%7Czh-CN%7CCNY%7C156; lgc=%5Cu90AA%5Cu8272%5Cu73AB%5Cu7470; dnk=%5Cu90AA%5Cu8272%5Cu73AB%5Cu7470; enc=jeabZ6RdKJ8atsmP5bmmuXkTQIp4FisJU2OrrhnHtayrgkI%2FtsUaeXsHutbc9MTCk7L0WNdijmNDWJn0o%2Bw