python获取id标签对应数据_Python爬虫:爬取某网站关键词对应商品ID,且存入DB2数据库...

import requests

from lxml import etree

import ibm_db

import logging

import sys

import time

import smtplib

# Configure logging: append to a weekly log file.
# NOTE(review): the tokens in `format` are not separated by spaces in the
# original; kept byte-identical so existing log-parsing keeps working.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s%(filename)s[line:%(lineno)d]%(levelname)s%(message)s',
                    datefmt='%a,%d%b %Y %H:%M:%S',
                    filename='keywords_weekly.log',
                    filemode='a')

# BUG FIX: `reload` / `sys.setdefaultencoding` exist only on Python 2; running
# the original unconditionally raises NameError on Python 3 (where str is
# already unicode and the hack is unnecessary).  Guard by interpreter version.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821  (Python 2 builtin)
    sys.setdefaultencoding('utf-8')

# 解决服务器限制访问问题

def get_url_data(url, headers, max_tries=10):
    """Fetch *url* with retries to work around server-side rate limiting.

    Tries up to *max_tries* times, sleeping 60 seconds between failed
    attempts.  Returns the `requests` response of the first successful GET.
    Raises Exception when every attempt fails.
    """
    for _attempt in range(max_tries):
        try:
            return requests.get(url, headers=headers)
        # BUG FIX: the original `except requests.exceptions:` named a
        # *module*, not an exception class, so it never matched (and is a
        # TypeError on Python 3).  RequestException is the base class of
        # every error requests raises.
        except requests.exceptions.RequestException:
            time.sleep(60)  # back off before retrying
    raise Exception("Couldn't get the url_data.")

#写入db2

def write_db2(resultdict):
    """Insert one keyword/goods row into T_KEYWORDS_weekly via the module
    connection ``conn`` (opened in ``__main__``).

    Expects keys 'rank', 'uv', 'frequency', 'goods_id' in *resultdict*.
    The keyword text itself is deliberately NOT inserted here: because of a
    Chinese-encoding issue it is merged in afterwards by the MERGE statement
    in ``__main__``.
    """
    # SECURITY FIX: the original interpolated scraped page content straight
    # into the SQL text with %r (SQL injection).  Use a prepared statement
    # with parameter markers; the week-of-year expression stays in SQL.
    sql_in = ("insert into T_KEYWORDS_weekly(K_RANK,UV,FREQUENCY,GOODS_ID,week_YEAR)"
              " values (?,?,?,?,year(current date)||'-'||WEEK_ISO(current date))")
    stmt = ibm_db.prepare(conn, sql_in)
    ibm_db.execute(stmt, (resultdict['rank'],
                          resultdict['uv'],
                          resultdict['frequency'],
                          resultdict['goods_id']))
    ibm_db.commit(conn)

# #翻页

def get_html(keywords):
    """Return the list of search-result page responses for *keywords*.

    Fetches page 1, reads the total page count from the pager panel, then
    fetches pages 2..N.  Every request goes through ``get_url_data`` so it
    gets the retry/back-off behaviour.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    headers = {'User-Agent': user_agent}
    # BUG FIX: the original fetched page 1 from the placeholder host
    # 'www.XXX.com' (a redaction artifact) while pages 2+ used
    # 'www.meilele.com'; use one base URL for all pages.
    base_url = 'http://www.meilele.com/category-9999/list-p%d/?fl=q&keywords=%s'
    html = get_url_data(base_url % (1, keywords), headers)
    html_list = [html]
    selector = etree.HTML(html.text)
    # Total number of result pages, as shown in the pager panel (<b> text).
    page_e = selector.xpath('/html/body/div[@class="page-panel"]/div/div/span[@class="p-info"]/b/text()')
    if page_e:
        for i in range(2, int(page_e[0]) + 1):
            # CONSISTENCY FIX: the original called requests.get directly
            # here, bypassing the retry logic used for page 1.
            html_list.append(get_url_data(base_url % (i, keywords), headers))
    return html_list

# #获取内容

def get_id(dictionary):
    """Scrape goods IDs for one keyword row and persist them via write_db2.

    *dictionary* is an ibm_db ``fetch_both`` row; by position:
    [0]=rank, [1]=keywords, [2]=uv, [3]=frequency.
    Writes one DB row per goods id found, or one row with an empty goods_id
    when a page has no goods list.
    """
    keywords = dictionary[1]
    html_list = get_html(keywords)
    logging.info("get the html_list%ssuccessfully" % keywords)

    def _base_result():
        # Columns shared by every inserted row for this keyword; the keyword
        # text itself is merged in later (see the MERGE in __main__).
        return {'rank': str(dictionary[0]),
                'uv': str(dictionary[2]),
                'frequency': str(dictionary[3])}

    for html in html_list:
        selector = etree.HTML(html.text)
        try:
            content_field = selector.xpath('//*[@id="JS_list_panel"]/div[@class="w list-wrap"]/ul[@class="list-goods clearfix"]')[0]
        # BUG FIX: narrowed the original bare `except` — only the empty
        # xpath result ([0] on an empty list) should take this branch.
        except IndexError:
            # No goods list on this page: record the keyword with no id.
            result = _base_result()
            result['goods_id'] = ''
            write_db2(result)
        else:
            # One row per <li data-goods-id=...> in the goods list.
            for i in range(1, len(content_field) + 1):
                result = _base_result()
                result['goods_id'] = str(content_field.xpath('li[%d]/@data-goods-id' % i)[0])
                write_db2(result)

if __name__ == "__main__":
    # Connect to DB2 (credentials were redacted in the original post).
    # BUG FIX: the original wrapped the bare name `conn` in try/except,
    # which can never raise — the connect call itself must be inside the try.
    try:
        conn = ibm_db.connect("DATABASE=aedw;HOSTNAME=miranda;PORT=50000;PROTOCOL=TCPIP;UID=miranda; PWD=miranda;", "", "")
        logging.info("connect to DB2 successfully")
    except Exception:
        logging.info("couldn't connect to DB2")
        raise  # nothing below can run without a connection

    # Fetch the keyword rows still to be processed.
    # NOTE(review): the rank>=162 restart point looks like a manual resume
    # value — confirm before reuse.
    sql_select = "select * from T_keywords_weekly_TEMP where rank>=162"
    try:
        stmt_select = ibm_db.exec_immediate(conn, sql_select)
        logging.info("get the data from T_keywords_weekly_TEMP")
    except Exception:
        logging.info("couldn't get the data from T_keywords_weekly_TEMP")
    else:
        dictionary = ibm_db.fetch_both(stmt_select)
        while dictionary != False:
            logging.info('rank:' + str(dictionary[0]) + ' keywords:' + str(dictionary[1]))
            get_id(dictionary)
            # Advance the cursor — without this the loop never terminates.
            dictionary = ibm_db.fetch_both(stmt_select)

    # Merge the keyword text into the target table (done in SQL to dodge the
    # Chinese-encoding issue noted in write_db2).
    # BUG FIX: the scraped original had its whitespace collapsed
    # ("tkmUSING", "RANKand"), which is not valid SQL; reconstructed.
    sql_update = '''MERGE INTO T_KEYWORDS_weekly as tkm
                    USING T_keywords_weekly_TEMP as tkmt
                    ON tkm.K_RANK=tkmt.RANK
                       and tkm.week_YEAR=year(current date)||'-'||WEEK_ISO(current date)
                    WHEN MATCHED THEN UPDATE SET tkm.KEYWORDS=tkmt.KEYWORDS
                    ELSE IGNORE'''
    try:
        ibm_db.exec_immediate(conn, sql_update)
        logging.info("update the keywords")
    except Exception:
        logging.info("couldn't update the keywords")

    # RESOURCE FIX: close the connection (the original left it open).
    ibm_db.close(conn)
    logging.info("close the connect!")

    # Send a completion notification email.
    sender = 'tangxin2@meilele.com'
    receivers = ['tangxin2@meilele.com']
    SUBJECT = "Successfully update T_KEYWORDS_weekly"
    TEXT = '''Dear miranda,your python script of update T_KEYWORDS_weekly of last month have successed.may you have a good mood'''
    # BUG FIX: the original built the message with no separators between the
    # From/To/Subject headers and the body, producing a malformed RFC 2822
    # message; headers must be separated by CRLF and the body by a blank line.
    message = "From: %s\r\nTo: %s\r\nSubject: %s\r\n\r\n%s" % (
        sender, ", ".join(receivers), SUBJECT, TEXT)
    try:
        smtpObj = smtplib.SMTP('mail.meilele.com', 25)
        smtpObj.sendmail(sender, receivers, message)
        smtpObj.quit()  # close the SMTP session cleanly
        logging.info("Successfully sent email")
    except Exception:
        logging.info("Error: unable to send email")
    print('finish')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值