Python爬虫学习之抓取商品名称和价格

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/iTechzero/article/details/53537733

网上有一些可以查看商城历史价格的网站,想买某件商品时查一查当前是不是历史最低价,还是蛮有用的。用了几次之后,我开始好奇这种网站的功能是怎么实现的:其实就是使用爬虫技术,定时爬取这些商城的商品价格,然后保存到数据库里;以后再爬取的时候,如果价格没变就跳过,如果价格有变动,就将变动的时间和价格插入数据库。日积月累,以后就能查到这些商品的历史价格了。

原理清楚了,接下来就是考虑怎么实现。一开始我用PHP做了一个爬取京东的小爬虫,能跑起来,就是效率太低了,因为PHP是单线程的。在网上看到爬虫大多是用Python写的,于是花了3天时间学习Python,做了一个抓取京东商品名称和价格的简单小爬虫。先把代码放上来,有时间再整理一下这个小爬虫的工作流程。

附上源码供参考:

import datetime
import http.client
import json
import re
import threading
import time
import urllib
import urllib.parse
import urllib.request
from collections import deque

import pymysql
# --- Crawl bookkeeping shared by all worker threads -------------------------
g_queue = deque()                       # crawl queue of list-page URLs
g_visited_list = set()                  # list pages that have been crawled
g_visited_item = set()                  # product ids that have been crawled
g_cnt_list_all = g_cnt_list_sec = 0     # list pages: attempted / fetched successfully
g_cnt_item_all = g_cnt_item_sec = 0     # products: attempted / fetched successfully

# --- Database state ----------------------------------------------------------
g_this_date = datetime.date.today()     # value stored in the date_add column
g_db = pymysql.connect(user='root', passwd='', host='localhost', db='qw', charset='utf8')
g_db_cursor = g_db.cursor()
# A batched INSERT is this prefix followed by the buffered "(...)," tuples.
g_db_sql_start = "INSERT INTO qi_jd_goods(goods_id,name,price,date_add) VALUES "
g_db_sql_val = ''                       # buffered VALUES tuples awaiting a flush
> 
class SpiderJd():
    """Crawler for JD.com list pages and the products they link to.

    The class keeps no per-instance state.  Crawl progress lives in the
    module-level ``g_*`` globals (queue, visited sets and counters), and
    scraped rows are handed to the module-level ``pymysql_insert``.
    """

    # Patterns are compiled once; raw strings keep ``\d`` / ``\s`` from being
    # treated as (invalid) string escapes.
    _RE_LINK = re.compile(r'href="(.+?)"')
    _RE_ITEM_ID = re.compile(r'/(\d+)')
    _RE_NAME = re.compile(r'<div id="name">\s*<h1>(.+?)</h1>')

    def go(self, tid, q):
        """Crawl one list page.

        Every ``href`` on the page is inspected: links whose path is
        ``/list.html`` are queued for later crawling, and links to
        ``item.jd.com`` are scraped immediately via :meth:`get_item`.

        Args:
            tid: Identifier of the calling worker, used only in log output.
            q: URL of the list page to crawl; a falsy value is a no-op.
        """
        global g_cnt_list_all
        global g_cnt_list_sec
        global g_cnt_item_all

        if not q:
            return
        this_url = q
        g_visited_list.add(this_url)      # mark as visited
        g_cnt_list_all += 1
        data = self.get_data(this_url)
        if data is None:
            return
        g_cnt_list_sec += 1

        for href in self._RE_LINK.findall(data):
            try:
                # Resolve scheme-relative ("//host/path") and relative links
                # against the current page; absolute URLs pass through
                # unchanged.  Without this urlopen() rejects them later.
                link = urllib.parse.urljoin(this_url, href)
                parsed = urllib.parse.urlparse(link)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): skip it.
                continue

            if parsed.path == '/list.html' and link not in g_visited_list:
                # Mark at enqueue time so the same list page is not queued
                # again by every page that links to it.
                g_visited_list.add(link)
                g_queue.append(link)

            if parsed.netloc == 'item.jd.com':
                id_match = self._RE_ITEM_ID.search(parsed.path)
                if id_match is None:
                    continue
                item_id = id_match.group(1)
                if item_id in g_visited_item:
                    continue
                g_visited_item.add(item_id)
                g_cnt_item_all += 1
                try:
                    self.get_item(tid, item_id)
                except Exception as exc:
                    # Best effort: one bad product must not stop the page,
                    # but the failure is reported instead of hidden.
                    print('tid=%s, id=%s, failed: %r' % (tid, item_id, exc))

    def get_item(self, tid, gid):
        """Scrape one product and buffer it for insertion into the database.

        Products whose name cannot be extracted are skipped and are not
        counted as successes.

        Args:
            tid: Identifier of the calling worker, used only in log output.
            gid: Product id as a string of digits.
        """
        global g_cnt_item_sec

        this_name = self.get_name(gid)
        if this_name is None:
            return
        this_price = self.get_price(gid)
        g_cnt_item_sec += 1
        print('tid=%s, cnt_l_a=%d, cnt_l_s=%d, cnt_i_a=%d, cnt_i_s=%d, id=%s, name=%s, price=%s'
              % (tid, g_cnt_list_all, g_cnt_list_sec, g_cnt_item_all,
                 g_cnt_item_sec, gid, this_name, this_price))

        # The name and price come from remote pages, so they are escaped
        # before being spliced into the VALUES tuple; a stray quote would
        # otherwise break the whole batched INSERT.
        esc = g_db.escape_string
        pymysql_insert("('%s','%s','%s','%s')," % (
            esc(gid), esc(this_name), esc(str(this_price)), g_this_date))

    def get_data(self, url):
        """Fetch *url* and return its body as text.

        Args:
            url: Absolute URL to fetch.

        Returns:
            The decoded body, or ``None`` when the request fails, the
            response is not HTML, or the body cannot be decoded.  The charset
            is taken from the ``Content-Type`` header, defaulting to UTF-8.
        """
        try:
            # ``with`` closes the response even when reading fails.
            with urllib.request.urlopen(url, timeout=2) as resp:
                content_type = resp.headers.get('Content-Type') or ''
                if 'html' not in content_type.lower():
                    return None
                charset = resp.headers.get_content_charset() or 'utf-8'
                raw = resp.read()
        except (OSError, ValueError, http.client.HTTPException):
            # OSError covers URLError/HTTPError/timeouts; ValueError covers
            # URLs urlopen cannot interpret (e.g. missing scheme).
            return None
        try:
            return raw.decode(charset)
        except (UnicodeDecodeError, LookupError):
            # Undecodable body, or a charset name Python does not know.
            return None

    def get_price(self, gid):
        """Return the current price of product *gid*.

        Args:
            gid: Product id as a string of digits.

        Returns:
            The value of the ``p`` field from the price service's JSON reply,
            or ``0`` when the request fails, the reply is malformed, or the
            field is empty or ``'-1.00'``.
        """
        price_url = "http://p.3.cn/prices/mgets?skuIds=J_" + gid + "&type=1"
        try:
            with urllib.request.urlopen(price_url, timeout=2) as resp:
                price_json = json.loads(resp.read().decode("gbk"))[0]
            price = price_json['p']
        except (OSError, ValueError, LookupError, TypeError,
                http.client.HTTPException):
            # Network failure, bad encoding/JSON, or an unexpected shape.
            return 0
        if price and price != '-1.00':
            return price
        return 0

    def get_name(self, gid):
        """Return the product name from the item page, or ``None``.

        Args:
            gid: Product id as a string of digits.
        """
        data = self.get_data("http://item.jd.com/" + gid + ".html")
        if data is None:
            return None
        name_match = self._RE_NAME.search(data)
        return name_match.group(1) if name_match else None
class myThread(threading.Thread):
    """Worker thread that drains the shared crawl queue.

    Relies on the module-level ``g_queue``, ``queueLock`` and the shared
    ``SpiderJd`` instance ``s`` that the ``__main__`` section sets up.
    """

    def __init__(self, tid):
        """Remember *tid*, the string label used in this worker's log lines."""
        threading.Thread.__init__(self)
        self.tid = tid

    def run(self):
        """Pop list-page URLs until the queue is empty and crawl each one."""
        print("Starting " + self.tid)
        while True:
            # The emptiness check and the pop happen under the same lock, so
            # another worker cannot empty the queue in between.  ``with``
            # also releases the lock if anything raises.
            with queueLock:
                if not g_queue:
                    break
                q = g_queue.popleft()

            s.go(self.tid, q)
        print("Exiting " + self.tid)


# Serialises access to the shared SQL buffer and to the single DB cursor,
# both of which are used by every worker thread.
_g_db_lock = threading.Lock()
_g_db_pending_rows = 0      # rows currently buffered in g_db_sql_val
_DB_BATCH_SIZE = 100        # rows per batched INSERT


def pymysql_insert(val):
    """Buffer one VALUES tuple and flush the buffer every 100 rows.

    Args:
        val: SQL fragment of the form ``"(...),"`` (with the trailing comma)
            to append after ``g_db_sql_start``.  An empty string is ignored.

    The row is appended before the flush decision, so the row that completes
    a batch is written with it rather than discarded.  If the INSERT fails
    the transaction is rolled back, the error is printed and that batch is
    dropped.
    """
    global g_db_sql_val
    global _g_db_pending_rows

    if not val:
        return
    with _g_db_lock:
        g_db_sql_val += val
        _g_db_pending_rows += 1
        if _g_db_pending_rows < _DB_BATCH_SIZE:
            return

        # Strip the trailing comma of the last tuple to finish the statement.
        sql = g_db_sql_start + g_db_sql_val[:-1]
        g_db_sql_val = ''
        _g_db_pending_rows = 0
        try:
            g_db_cursor.execute(sql)
            g_db.commit()
        except pymysql.MySQLError as exc:
            # Roll back so the connection stays usable for the next batch.
            g_db.rollback()
            print("DB insert failed, batch dropped: %r" % (exc,))
>  
>  
if __name__ == '__main__':
    # Script entry point: seed the queue, run the workers to completion,
    # write any rows still buffered, then report the end time.
    print(datetime.datetime.today())    # start time
    queueLock = threading.Lock()        # guards g_queue pops in myThread.run
    url = "http://www.jd.com/allSort.aspx"
    g_queue.append(url)

    s = SpiderJd()

    threads = []
    for count in range(1, 10):
        if count != 1:
            # Stagger worker start-up by two seconds.
            time.sleep(2)
        thread = myThread(str(count))
        thread.start()
        threads.append(thread)

    # Wait for every worker; otherwise the end time below is printed while
    # the crawl is still running.
    for thread in threads:
        thread.join()

    # Rows are inserted in batches, so a final partial batch may still be
    # buffered here; write it before closing the connection.
    if g_db_sql_val:
        try:
            g_db_cursor.execute(g_db_sql_start + g_db_sql_val[:-1])
            g_db.commit()
        except pymysql.MySQLError as exc:
            g_db.rollback()
            print("Final DB insert failed: %r" % (exc,))
        g_db_sql_val = ''
    g_db.close()

    print(datetime.datetime.today())    # end time

没有更多推荐了,返回首页