环境:win32 python2.7
'''frombs4 importBeautifulSoup
importrequests
importMySQLdb
importdatetime
#编码importsys
reload(sys)
sys.setdefaultencoding('utf-8')
#连接数据库,并插入爬到的数据definsert_db(page_list):
try:
#注意链接时加上charset='utf8'解决编码问题conn = MySQLdb.connect(user='root', passwd='112233aa',host='192.168.1.14',db='miranda.tang',charset='utf8')
cursor = conn.cursor()
#删除当日已插入数据,避免重复插入cursor.execute('DELETE FROM yixun_price_refrigerator
WHERE update_day=CURRENT_DATE()')
conn.commit() #提交#用executemany一次性提交爬取数据,比直接用execute快sql='INSERT INTO yixun_price_refrigerator
values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'cursor.executemany(sql,
page_list)
conn.commit() #提交cursor.close() #关闭cursorconn.close() #关闭连接exceptException ase:
printe
conn.rollback()
#得到soupdefurlBS(url):
response=requests.get(url)
soup = BeautifulSoup(response.text,"lxml")
returnsoup
#得到一共有多少页defget_pagenumber(url):
soup=urlBS(url)
page=soup.select('.sort_page_num span')[0]
page_contents=page.contents[1]
pagenumber=int(page_contents.replace('/',''))
returnpagenumber
#得到页面信息defget_info(product_url):
soup=urlBS(product_url)
# print soup
#get titletitle = unicode(soup.title.text.strip().strip(u'【价格_报价_图片_行情】-易迅网').replace(u'】',''))\
.encode('utf-8').decode('utf-8')
#print title
#get_原价try:
soup_origin = soup.find("dl", { "class": "xbase_item xprice xprice_origin"})
price_origin = soup_origin.find("span", { "class": "mod_price xprice_val"}).\
contents[1].text.encode('utf-8').decode('utf-8')
# print u'原价:' + price_originexcept:
price_origin=0
#pass
#get现价try:
soup_sale= soup.find('dl',{'class':'xbase_item xprice'})
price_sale = soup_sale.find("span", { "class": "mod_price xprice_val"}).contents[1].encode('utf-8').decode('latin1')
#print u'现价:'+ price_saleexcept:
price_sale=0
#pass
#得到列名名称oup_info_name=soup.find_all('td',{'class':'name'})
# for each in
oup_info_name:
#
print each.contents[0].encode('utf-8').decode('utf-8')name_list=[each.contents[0].encode('utf-8').decode('utf-8') foreach inoup_info_name]
#得到内容soup_info_desc=soup.find_all('td',{'class':'desc'})
# for each in
soup_info_desc:
#prod_list=[soup_info_desc[0].contents[0].encode('utf-8').decode('latin1')]prod_list=[each.contents[0].encode("utf-8").decode("utf-8") foreach insoup_info_desc] #用列表生成式将原表格中的数据放入列表中pro_dic={}
pro_list=[today,product_url,title,price_origin,price_sale]
#因为列名爬取数据中不分数据是没有的,通过字典的方式,把没有的数据记录为空fori inrange(len(name_list)):
pro_dic[name_list[i]]=prod_list[i]
name=['品牌','型号','颜色','能效等级','冰箱容积','制冷方式','门款式','重量','尺寸','制冷类型',
'显示方式','定频/变频','除霜模式', '冷冻室温度区间','冷藏室温度区间','冰箱冷柜机型','操作方式']
foreach inname:
try:
each=each.encode("utf-8").decode("utf-8")
pro_list.append(pro_dic[each])
pro_dic[each]except:
pro_list.append('')
# print 'null'
# print pro_list
# print len(pro_list)page_list.append(pro_list)
#得到商品页链接defget_product_href(url):
soup=urlBS(url)
product_list=soup.select('#itemList .mod_goods_img a')
product_listfori inrange(len(product_list)):
pro=product_list[i]
pro_href=pro['href']
# return pro_href
#print pro_hrefget_info(pro_href)
if__name__=='__main__':
beseurl='http://searchex.yixun.com/html?path=705882t705892&attr=42515e1o2o3o4o5o6o7'max_number=get_pagenumber(beseurl)
page_list=[]
today=datetime.date.today() #得到当前日期,插入更新日期fori inrange(1,max_number+1):
# for i in
range(1,2):newurl=beseurl+'&page='+str(i)
#print newurlget_product_href(newurl)
insert_db(page_list)
print("It's all done")
#建表# drop table yixun_price_refrigerator;
# CREATE TABLE yixun_price_refrigerator(
# update_day date --更新日期# ,product_url VARCHAR(300) --商品链接# ,title VARCHAR(300) --名称# ,price_origin VARCHAR(100) --原价# ,price_sale VARCHAR(100) --现价# ,Brands VARCHAR(100) --品牌# ,Goods_sn VARCHAR(100) --型号# ,Colour VARCHAR(100) --颜色# ,Energy_efficiency_rating VARCHAR(100)
--能效等级# ,Refrigerator_volume VARCHAR(100)
--冰箱容积# ,Refrigeration VARCHAR(100) --制冷方式# ,Door_style VARCHAR(100) --门款式# ,weight VARCHAR(100) --重量# ,size VARCHAR(100) --尺寸# ,Cooling_type VARCHAR(100) --制冷类型# ,Display_method VARCHAR(100) --显示方式# ,frequency VARCHAR(100) --定频/变频# ,Defrost_mode VARCHAR(100) --除霜模式# ,Freezer_temperature_range VARCHAR(100)
--冷冻室温度区间# ,Save_temperature_range VARCHAR(100)
--冷藏室温度区间# ,Fridge_freezer_models VARCHAR(100)
--冰箱冷柜机型# ,Operation_method VARCHAR(100)
--操作方式# );