"""Scrape Taobao search results for a keyword and save price/title pairs.

The script downloads a fixed number of search-result pages, extracts the
``view_price`` and ``raw_title`` fields embedded in each page, and writes
them to a tab-separated text file.
"""

import json
import re
from urllib.parse import quote

# Patterns for the fields embedded in the search page.  The title pattern
# allows backslash-escaped characters so an escaped quote inside a title
# does not end the match early.
_PRICE_RE = re.compile(r'"view_price":"([\d.]*)"')
_TITLE_RE = re.compile(r'"raw_title":"((?:[^"\\\n]|\\.)*)"')

# The search URL's ``s`` offset advances by this many items per page.
_ITEMS_PER_PAGE = 44


def get_html_text(url):
    """Return the body of *url* decoded as UTF-8, or "" if the request fails.

    Network errors and non-2xx responses are reported as an empty string so
    that callers can simply skip the page.
    """
    # Imported here rather than at module level so the parsing and file
    # helpers can be imported without the third-party HTTP dependency.
    import requests

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        return ""
    response.encoding = 'utf-8'
    return response.text


def _decode_json_string(raw):
    """Decode the escapes of a JSON string body; return it as-is if invalid."""
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw


def parse_page(ilt, html):
    """Append ``[price, title]`` pairs found in *html* to the list *ilt*.

    Prices and titles are collected independently and paired by position;
    if the page holds different numbers of each, the surplus is ignored.
    Prices are kept as strings exactly as they appear in the page.
    """
    prices = _PRICE_RE.findall(html)
    # Titles are decoded as JSON strings instead of being passed to eval():
    # the text comes from a remote server and must never be executed.
    titles = [_decode_json_string(raw) for raw in _TITLE_RE.findall(html)]
    for price, title in zip(prices, titles):
        ilt.append([price, title])


def clear_content(filename):
    """Create *filename* if needed and truncate it to zero length."""
    # Opening in 'w' mode already truncates; nothing has to be written.
    with open(filename, 'w'):
        pass


def save_goods(filename, goods_list):
    """Append a header and one numbered row per item to *filename*.

    Each element of *goods_list* is a sequence whose first item is the price
    and whose second item is the title.  Rows are numbered from 1.
    """
    pattern = "{:4}\t{:8}\t{:16}"
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(pattern.format("序号", "价格", "商品名称") + '\n')
        for count, goods in enumerate(goods_list, start=1):
            f.write(pattern.format(count, goods[0], goods[1]) + '\n')


def main():
    """Fetch the configured search pages and write the results to disk."""
    goods = '零食'
    depth = 2
    # Percent-encode the keyword so the URL is valid regardless of client.
    start_url = 'http://s.taobao.com/search?q=' + quote(goods)
    goods_list = []
    for page in range(depth):
        url = start_url + '&s=' + str(_ITEMS_PER_PAGE * page)
        # A failed download yields "", which simply produces no items.
        parse_page(goods_list, get_html_text(url))
    filename = "taobao_snacks.txt"
    clear_content(filename)
    save_goods(filename, goods_list)
    print("商品信息已保存到" + filename)


if __name__ == "__main__":
    main()
# Reposted from: https://www.cnblogs.com/songfei90/p/10523126.html