import time

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By


# Scrape category data
def tianmao_catch_category():
    """Scrape the Tmall category tree and store it through ``db.saveCategory``.

    Opens the Tmall home page, hovers every entry of the left-hand navigation
    list, then parses the resulting page HTML with PyQuery. Every
    (level-one, level-two, level-three) name triple that is found is printed
    and saved under the site name '天猫'.

    Takes no arguments and returns None. Any ``Exception`` raised while
    scraping is printed instead of propagated, and the browser is closed in
    all cases.
    """
    driver = get_driver('', False)
    try:
        url = 'https://www.tmall.com/?ali_trackid=2:mm_26632258_3504122_55934697:1609295236_235_1586302010&union_lens=recoveryid:1609295236_235_1586302010&clk1=3a059b6fd5d21a5e9086e711fdf3afe4&bxsign=tbkJxFfRkMJdwE3OwpP483v2+4G1PrzCDIDumBW7tv5QzQfc+xlm3i2oiRMn2bJl4qaPrxH6ekD1p3hgS1sBUJbM4REq9LyuFhLBITi5yXSBSs='
        driver.get(url)
        # Fixed pause after navigation, before the navigation list is queried.
        time.sleep(10)

        # find_elements_by_xpath() no longer exists in Selenium >= 4.3;
        # find_elements(By.XPATH, ...) works on old and new releases alike.
        nav_items = driver.find_elements(
            By.XPATH, "//ul[@class='normal-nav clearfix']/li")

        top_names = []
        for nav_item in nav_items:
            # Mouse event: hover the entry (presumably this is what makes the
            # page render the entry's fly-out panel -- confirm).
            ActionChains(driver).move_to_element(nav_item).perform()

            # NOTE(review): the original chained .replace('/', '/') and
            # .replace('', ''), both of which are no-ops as written. The
            # accompanying comment says the goal is to delete characters
            # like U+E615, so private-use-area glyphs are removed here. The
            # slash replacement is assumed to have targeted the full-width
            # slash (U+FF0F) -- confirm against the real page text.
            title = nav_item.text.replace('\uff0f', '/').strip()
            # The glyph removal deliberately happens after strip(): a leading
            # "<glyph> name" then splits into ['', 'name'] below.
            title = ''.join(
                ch for ch in title if not '\ue000' <= ch <= '\uf8ff')

            tokens = title.split(' ')
            if len(tokens) == 1:
                top_names.append(tokens[0])
            elif len(tokens) == 2:
                top_names.append(tokens[1])
            else:
                # Previously nothing was appended in this case, which shifted
                # every later name against its panel. Keep the positions
                # aligned by storing the whole title.
                top_names.append(title.strip())
            time.sleep(3)

        selenium_html = driver.execute_script(
            "return document.documentElement.outerHTML")
        doc = pq(selenium_html)
        panels = list(
            doc("div[class='content-con j_categoryContent']")
            .find("div[class='pannel-con j_CategoryMenuPannel']")
            .find("div[class^='pannel-']")
            .items())
        print('\n')

        netname = '天猫'
        if len(panels) != len(top_names):
            # Names and panels are paired purely by position; report a
            # mismatch instead of failing with an IndexError part-way through.
            print('tianmao: %d navigation names but %d panels'
                  % (len(top_names), len(panels)))

        for category_one, panel in zip(top_names, panels):
            hot_lines = panel.find("div[class='hot-word-con']").find(
                "div[class='hot-word-line']")
            for hot_line in hot_lines.items():
                category_two = hot_line.find("div[class='line-title']").find(
                    "div[class='title-text']").text()
                words = hot_line.find("div[class='line-con']").find(
                    "a[class^='hot-word']")
                for word in words.items():
                    category_three = word.text()
                    print(category_one, category_two, category_three)
                    db.saveCategory(
                        netname, category_one, category_two, category_three)
        print('\n')
    except Exception as ex:
        # Best-effort scrape: report the problem and fall through to cleanup.
        print(ex)
    finally:
        # Runs on every exit path (including Ctrl-C) so no browser is leaked.
        driver.quit()


# Scrape category data
def jingdong_catch_category():
    """Scrape the JD.com category tree and store it through ``db.saveCategory``.

    Opens the JD home page, hovers every item of the category menu, then
    parses the resulting page HTML with PyQuery. For each category panel the
    channel links and the detail lists are walked, and every
    (level-one, level-two, level-three) name triple is printed and saved
    under the site name '京东'.

    Takes no arguments and returns None. Any ``Exception`` raised while
    scraping is printed instead of propagated, and the browser is closed in
    all cases.
    """
    # The original kept a commented-out alternative that built the driver
    # from a proxy returned by ip_read(); the direct connection is used here.
    driver = get_driver('', False)
    try:
        url = 'https://www.jd.com/?cu=true&utm_source=baidu-pinzhuan&utm_medium=cpc&utm_campaign=t_288551095_baidupinzhuan&utm_term=0f3d30c8dba7459bb52f2eb5eba8ac7d_0_48ba7a220ee5462c97fc2d5f3691e5c5'
        driver.get(url)
        # Fixed pause after navigation, before the menu is queried.
        time.sleep(10)

        # find_elements_by_xpath() no longer exists in Selenium >= 4.3;
        # find_elements(By.XPATH, ...) works on old and new releases alike.
        menu_items = driver.find_elements(
            By.XPATH,
            "//ul[@class='JS_navCtn cate_menu']/li[@class='cate_menu_item']")

        top_names = []
        for menu_item in menu_items:
            # Hover the item (presumably this is what makes the page render
            # the item's pop-up panel -- confirm).
            ActionChains(driver).move_to_element(menu_item).perform()

            # NOTE(review): the original chained two .replace('/', '/') calls
            # and a .replace('', ''), all no-ops as written. They are assumed
            # to have targeted the full-width slash (U+FF0F) and icon-font
            # glyphs in the private-use area -- confirm against the real
            # page text.
            title = menu_item.text.replace('\uff0f', '/').strip()
            title = ''.join(
                ch for ch in title if not '\ue000' <= ch <= '\uf8ff')
            print('data_title=', title)
            top_names.append(title)
            time.sleep(3)

        selenium_html = driver.execute_script(
            "return document.documentElement.outerHTML")
        doc = pq(selenium_html)
        panels = list(
            doc("div[id='J_popCtn']")
            .find("div[class='cate_part clearfix']")
            .items())
        print('\n')

        netname = '京东'
        if len(panels) != len(top_names):
            # Names and panels are paired purely by position; report a
            # mismatch instead of failing with an IndexError part-way through.
            print('jingdong: %d menu names but %d panels'
                  % (len(top_names), len(panels)))

        pairs = list(zip(top_names, panels))
        for category_one, panel in pairs:
            first_column = panel.find("div[class='cate_part_col1']")

            # Channel links: the first link supplies the level-two name and
            # is saved with an empty level-three name; each later link is
            # saved as a level-three name under it.
            channel_links = first_column.find("div[class='cate_channel']").find(
                "a[class='cate_channel_lk']")
            category_two = ''
            for position, channel_link in enumerate(channel_links.items()):
                category_three = ''
                if position == 0:
                    category_two = str(channel_link.text())
                else:
                    category_three = str(channel_link.text())
                print(category_one, category_two, category_three)
                db.saveCategory(
                    netname, category_one, category_two, category_three)

            # Detail lists.
            # NOTE(review): only the first <dl> contributes a level-two title
            # and its own links are never saved; every later <dl>'s links are
            # stored under that first title. If each <dl> carries its own
            # title this is a bug -- behaviour kept as-is pending
            # confirmation against the live page structure.
            detail_items = first_column.find("div[class='cate_detail']").find(
                "dl[class^='cate_detail_item cate_detail_item']")
            category_two = ''
            for position, detail_item in enumerate(detail_items.items()):
                if position == 0:
                    category_two = str(
                        detail_item.find("dt[class='cate_detail_tit']")
                        .find("a[class='cate_detail_tit_lk']")
                        .text())
                else:
                    detail_links = detail_item.find(
                        "dd[class='cate_detail_con']").find(
                        "a[class='cate_detail_con_lk']")
                    for detail_link in detail_links.items():
                        category_three = str(detail_link.text())
                        print(category_one, category_two, category_three)
                        db.saveCategory(
                            netname, category_one, category_two,
                            category_three)
        print('\n')
        # Number of top-level panels that were processed.
        print(len(pairs))
    except Exception as ex:
        # Best-effort scrape: report the problem and fall through to cleanup.
        print(ex)
    finally:
        # Runs on every exit path (including Ctrl-C) so no browser is leaked.
        driver.quit()