Multithreaded Scraping with Python

Three worker threads cooperate through shared in-memory lists used as work queues: one scrapes the product listing pages, one downloads images, and one scrapes the product detail pages and writes the results to MySQL.

```python
import time
import os
import os.path
import re
from threading import Thread

import requests
from scrapy import Selector

import mysqlConcent  # local helper module wrapping the MySQL connection

# Shared work queues (plain lists; list.pop() is atomic in CPython)
productlist = []    # listing-page URLs waiting to be scraped
productdetail = []  # detail-page tasks waiting to be scraped
pics = []           # image paths waiting to be downloaded

url = 'http://www.ddddd.com/'  # base URL of the target site
mysqldb = mysqlConcent.MysqlLocal()


class GetProductList(Thread):
    """Scrape listing pages, queue detail pages and cover images."""

    def run(self):
        while 1:
            try:
                page_url = productlist.pop()
            except IndexError:
                time.sleep(0.5)  # queue is empty; back off instead of spinning
                continue
            classify_id = re.search(r'\d+', page_url).group()
            list_info = requests.get(page_url).text
            sel = Selector(text=list_info)
            getgooditem = sel.xpath("//div[@class='goodsItem']").extract()
            if getgooditem:
                for j in getgooditem:
                    sell = Selector(text=j)
                    urls = sell.xpath("//a/@href").extract()[0:1]
                    title = sell.xpath("//div/p/a/@title").extract()[0]
                    face_img = sell.css('img::attr(src)').extract()[0]
                    idstr = sell.xpath("//div/a/@href").extract()
                    product_id = re.search(r'\d+', idstr[0]).group()
                    price_str = sell.xpath("//div/font/text()").extract()
                    price = re.search(r'[0-9]+', price_str[0]).group()
                    detail_url = url + urls[0]  # relative href joined to the base URL
                    param = {
                        'id': product_id,
                        'title': title,
                        'price': price,
                        'face_pic': face_img,
                        'classify_id': classify_id,
                        'status': 'Y',
                        'date_create': mysqldb.getSysTime(),
                    }
                    # Queue the cover image for the download thread
                    pics.append(face_img)
                    # Insert the scraped record into the product table
                    flg = mysqldb.addOneDataToTable(table_name='product', data=param, flags='1')
                    if flg:
                        print('Insert succeeded, returned ID: {}'.format(flg))
                    # Queue the detail page for the detail thread
                    productdetail.append({'url': detail_url, 'product_id': product_id})
            # Queue the next page, if there is one
            try:
                pagebar = sel.css(".pagebar .next::attr('href')").extract()
                if pagebar:
                    productlist.append(url + pagebar[0])
            except Exception as e:
                print('Failed to fetch the next page: {}'.format(e))


class SaveImageToLocal(Thread):
    """Download queued images, mirroring their relative paths on disk."""

    def run(self):
        while 1:
            try:
                img_url = pics.pop()
            except IndexError:
                time.sleep(0.5)
                continue
            if os.path.exists(img_url):
                print('Target file already exists: %s' % img_url)
                continue
            # Fetch the raw image bytes
            res = requests.get(url + img_url).content
            # Create the destination directory if it does not exist yet
            pic_dir = os.path.dirname(img_url)
            if pic_dir and not os.path.isdir(pic_dir):
                os.makedirs(pic_dir)
            print('Creating file %s' % img_url)
            with open(img_url, 'wb') as fp:
                fp.write(res)
```
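The plain lists work because CPython's `list.pop()` is atomic, but the workers still poll in a loop and there is no built-in way to wait for all work to finish. Here is a minimal sketch of the same producer/consumer handoff using the standard library's thread-safe `queue.Queue`; the names `tasks` and `worker` are illustrative, not from the original script:

```python
import queue
import threading

tasks = queue.Queue()  # thread-safe FIFO replacing the shared list


def worker():
    while True:
        try:
            item = tasks.get(timeout=1)  # block briefly instead of spinning
        except queue.Empty:
            continue
        try:
            print('processing', item)    # stand-in for the real scrape/download
        finally:
            tasks.task_done()            # lets tasks.join() track completion


threading.Thread(target=worker, daemon=True).start()
for n in range(3):
    tasks.put('job-%d' % n)
tasks.join()  # returns once every queued item has been processed
```

The detail worker below keeps the original polling style.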
The third worker pulls detail tasks off the queue, extracts color and size with regular expressions, queues the detail images, and stores the assembled record:

```python
class SaveProductDetail(Thread):
    """Scrape queued detail pages and write them to product_detail."""

    def run(self):
        while 1:
            try:
                task = productdetail.pop()
            except IndexError:
                time.sleep(0.5)
                continue
            detail_url = task['url']
            product_id = task['product_id']
            print('Product URL: %s, ID: %s' % (detail_url, product_id))
            detail = requests.get(detail_url).text
            sel = Selector(text=detail)
            labels = sel.xpath("//label/text()").extract()
            label_text = str(labels).replace(' ', '').replace('\\n', '').strip('[\n]')
            color = ''
            size = ''
            try:
                # Color: the first run of Chinese characters in the label text
                if label_text:
                    color = re.findall('[\u4e00-\u9fa5]+', label_text)[0]
                # Size: digits plus a unit, e.g. "30cm", "20*30cm", "20/30cm)"
                if 'CM' in label_text or 'cm' in label_text:
                    size = re.findall(r'(\d+[a-zA-Z]+|\d+\*\d+\S+[a-zA-Z]+\S|\d+\/\d+\S+\))', label_text)[0]
            except IndexError:
                print(detail_url)
            # The detail ID is the first number in the detail URL
            product_detail_id = re.search(r'\d+', detail_url).group()
            product_detail = sel.xpath("//div[@id='com_h']/blockquote/p").extract()
            product_detail_imgs = sel.xpath("//div[@id='com_h']/blockquote/p//img/@src").extract()
            content = ''.join(product_detail)
            # Queue the detail images for the download thread
            for img in product_detail_imgs:
                detail_img = img.lstrip('/')  # drop a leading slash so paths stay relative
                print(detail_img)
                pics.append(detail_img)
            param = {
                'product_id': product_id,
                'product_detail_id': product_detail_id,
                'size': size,
                'color': color,
                'content': content.replace('\'', '\"'),
                'date_create': mysqldb.getSysTime(),
            }
            row_id = mysqldb.addOneDataToTable(table_name='product_detail', data=param)
            print(row_id)


if __name__ == '__main__':
    # Seed the listing queue from the site's category menu
    result = requests.get(url)
    sel = Selector(text=result.text)
    tag = sel.xpath("//div[@class='menu']/a/@href").extract()[1:7]
    for i in tag:
        productlist.append(url + i)
    t1 = GetProductList()      # product listings
    t2 = SaveImageToLocal()    # image downloads
    t3 = SaveProductDetail()   # product details
    t1.start()
    t2.start()
    t3.start()
```
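Absolute URLs are built here by plain string concatenation (`url + urls[0]`), which only works while the site emits hrefs relative to the root. A short sketch with made-up paths showing how `urllib.parse.urljoin` handles relative paths, absolute paths, and full URLs uniformly:

```python
from urllib.parse import urljoin

base = 'http://www.ddddd.com/'

print(urljoin(base, 'goods.php?id=123'))            # http://www.ddddd.com/goods.php?id=123
print(urljoin(base, '/images/1/cover.jpg'))         # http://www.ddddd.com/images/1/cover.jpg
print(urljoin(base, 'http://cdn.ddddd.com/a.jpg'))  # full URLs pass through unchanged
```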
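`mysqlConcent.MysqlLocal` is a local helper whose code is not shown, so its exact interface is unknown; a hypothetical `addOneDataToTable` built directly on `pymysql` with parameterized values (which would also make the manual quote replacement in `content` unnecessary) might look like this:

```python
import pymysql


def add_one_data_to_table(conn, table_name, data):
    """Insert one row; `data` maps column names to values.

    Table and column names cannot be parameterized, so they must come
    from trusted code, never from user input.
    """
    cols = ', '.join('`%s`' % c for c in data)
    placeholders = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO `{}` ({}) VALUES ({})'.format(table_name, cols, placeholders)
    with conn.cursor() as cur:
        # Values travel separately from the SQL text, so quotes need no escaping
        cur.execute(sql, list(data.values()))
        row_id = cur.lastrowid
    conn.commit()
    return row_id


# Hypothetical connection settings, for illustration only
conn = pymysql.connect(host='localhost', user='root', password='secret',
                       database='shop', charset='utf8mb4')
```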