python分类信息网_python商品分类信息

import time

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

# Scrape category data

def _clean_tmall_title(raw_title):
    """Return the level-one category name taken from a Tmall nav item's text.

    The text is stripped, Unicode private-use characters (icon-font glyphs
    such as U+E615) are removed, and the result is split on single spaces:
    one token is returned as is, and for two tokens the second is returned.
    Any other shape returns the whole cleaned text, so the caller always gets
    exactly one name per nav item.
    """
    # NOTE(review): the original called .replace('/', '/'), which is a no-op.
    # Presumably the first argument was a full-width slash (U+FF0F) that was
    # lost when the code was copied -- confirm against the live page.
    text = str(raw_title).replace('\uff0f', '/').strip()
    # The original .replace('', '') was also a no-op although its comment said
    # it should delete characters like \ue615; drop the whole private-use
    # range (U+E000..U+F8FF) instead.
    text = ''.join(ch for ch in text if not '\ue000' <= ch <= '\uf8ff').strip()
    tokens = text.split(' ')
    if len(tokens) == 1:
        return tokens[0]
    if len(tokens) == 2:
        return tokens[1]
    return text


def tianmao_catch_category():
    """Scrape the three-level category tree from the Tmall home page.

    Opens the page with a driver obtained from ``get_driver``, hovers over
    every item of the main navigation list while recording its cleaned
    title, then parses the rendered HTML with PyQuery and stores every
    (level one, level two, level three) triple through ``db.saveCategory``
    under the site name '天猫'.

    ``get_driver`` and ``db`` are defined outside this block. Any exception
    raised while scraping is printed rather than propagated, and the driver
    is always shut down.
    """
    driver = get_driver('', False)
    try:
        url = 'https://www.tmall.com/?ali_trackid=2:mm_26632258_3504122_55934697:1609295236_235_1586302010&union_lens=recoveryid:1609295236_235_1586302010&clk1=3a059b6fd5d21a5e9086e711fdf3afe4&bxsign=tbkJxFfRkMJdwE3OwpP483v2+4G1PrzCDIDumBW7tv5QzQfc+xlm3i2oiRMn2bJl4qaPrxH6ekD1p3hgS1sBUJbM4REq9LyuFhLBITi5yXSBSs='
        driver.get(url)
        time.sleep(10)

        # find_elements_by_xpath was removed in Selenium 4.3; the By-based
        # call works on both Selenium 3 and 4.
        nav_items = driver.find_elements(By.XPATH, "//ul[@class='normal-nav clearfix']/li")

        level_one_names = []
        for nav_item in nav_items:
            # Mouse hover; presumably this makes the page render the
            # sub-category panel for the item -- confirm.
            ActionChains(driver).move_to_element(nav_item).perform()
            level_one_names.append(_clean_tmall_title(nav_item.text))
            time.sleep(3)

        selenium_html = driver.execute_script("return document.documentElement.outerHTML")
        doc = pq(selenium_html)
        panels = (
            doc("div[class='content-con j_categoryContent']")
            .find("div[class='pannel-con j_CategoryMenuPannel']")
            .find("div[class^='pannel-']")
        )
        print('\n')

        netname = '天猫'
        if len(panels) != len(level_one_names):
            # Panels are matched to nav titles by position; report a mismatch
            # instead of failing with IndexError part-way through.
            print('warning: %d nav titles but %d panels' % (len(level_one_names), len(panels)))

        for category_one, panel in zip(level_one_names, panels.items()):
            lines = panel.find("div[class='hot-word-con']").find("div[class='hot-word-line']")
            for line in lines.items():
                category_two = line.find("div[class='line-title']").find("div[class='title-text']").text()
                links = line.find("div[class='line-con']").find("a[class^='hot-word']")
                for link in links.items():
                    category_three = link.text()
                    print(category_one, category_two, category_three)
                    db.saveCategory(netname, category_one, category_two, category_three)
        print('\n')
    except Exception as ex:
        # Best-effort scraper: report the failure and fall through to cleanup.
        print(ex)
    finally:
        driver.quit()


# Scrape category data

def _clean_jd_title(raw_title):
    """Return a JD nav item's text, stripped and without icon-font glyphs."""
    # NOTE(review): the original chained two .replace('/', '/') calls, both
    # no-ops. Presumably at least one first argument was a full-width slash
    # (U+FF0F) lost when the code was copied -- confirm against the live page.
    text = str(raw_title).replace('\uff0f', '/').strip()
    # The original .replace('', '') was a no-op; remove Unicode private-use
    # characters (U+E000..U+F8FF), which is what icon fonts use.
    return ''.join(ch for ch in text if not '\ue000' <= ch <= '\uf8ff')


def jingdong_catch_category():
    """Scrape the three-level category tree from the JD.com home page.

    Opens the page with a driver obtained from ``get_driver``, hovers over
    every ``cate_menu_item`` entry while recording its cleaned title, then
    parses the rendered HTML with PyQuery and stores every
    (level one, level two, level three) triple through ``db.saveCategory``
    under the site name '京东'.

    For each panel two sources are read:

    * channel links (``cate_channel_lk``): the first link is used as the
      level-two name and each remaining link is saved as a level-three
      entry under it;
    * detail groups (``cate_detail_item``): each group's ``<dt>`` link is
      the level-two name and each ``<dd>`` link in the same group is saved
      as a level-three entry.

    ``get_driver`` and ``db`` are defined outside this block. Any exception
    raised while scraping is printed rather than propagated, and the driver
    is always shut down.
    """
    driver = get_driver('', False)
    try:
        url = 'https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_48ba7a220ee5462c97fc2d5f3691e5c5'
        driver.get(url)
        time.sleep(10)

        # find_elements_by_xpath was removed in Selenium 4.3; the By-based
        # call works on both Selenium 3 and 4.
        nav_items = driver.find_elements(
            By.XPATH, "//ul[@class='JS_navCtn cate_menu']/li[@class='cate_menu_item']"
        )

        level_one_names = []
        for nav_item in nav_items:
            # Mouse hover; presumably this makes the page render the
            # sub-category panel for the item -- confirm.
            ActionChains(driver).move_to_element(nav_item).perform()
            data_title = _clean_jd_title(nav_item.text)
            print('data_title=', data_title)
            level_one_names.append(data_title)
            time.sleep(3)

        selenium_html = driver.execute_script("return document.documentElement.outerHTML")
        doc = pq(selenium_html)
        panels = doc("div[id='J_popCtn']").find("div[class='cate_part clearfix']")
        print('\n')

        netname = '京东'
        if len(panels) != len(level_one_names):
            # Panels are matched to nav titles by position; report a mismatch
            # instead of failing with IndexError part-way through.
            print('warning: %d nav titles but %d panels' % (len(level_one_names), len(panels)))

        processed = 0
        for category_one, panel in zip(level_one_names, panels.items()):
            left_column = panel.find("div[class='cate_part_col1']")

            # Channel links: first one names the group, the rest are leaves.
            channel_links = left_column.find("div[class='cate_channel']").find("a[class='cate_channel_lk']")
            category_two = ''
            for position, channel in enumerate(channel_links.items()):
                if position == 0:
                    category_two = str(channel.text())
                    continue
                category_three = str(channel.text())
                print(category_one, category_two, category_three)
                db.saveCategory(netname, category_one, category_two, category_three)

            # Detail groups. The original read the <dt> title only from the
            # first group (skipping that group's own links) and filed every
            # later group's links under that first title. Each group is now
            # read with its own title and its own links.
            # NOTE(review): inferred from the selectors, not from a verified
            # page -- confirm against the live markup.
            detail_groups = left_column.find("div[class='cate_detail']").find(
                "dl[class^='cate_detail_item cate_detail_item']"
            )
            for group in detail_groups.items():
                category_two = str(
                    group.find("dt[class='cate_detail_tit']").find("a[class='cate_detail_tit_lk']").text()
                )
                leaf_links = group.find("dd[class='cate_detail_con']").find("a[class='cate_detail_con_lk']")
                for leaf in leaf_links.items():
                    category_three = str(leaf.text())
                    print(category_one, category_two, category_three)
                    db.saveCategory(netname, category_one, category_two, category_three)

            processed += 1

        print('\n')
        # Number of panels that were processed.
        print(processed)
    except Exception as ex:
        # Best-effort scraper: report the failure and fall through to cleanup.
        print(ex)
    finally:
        driver.quit()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值