对于淘宝,我们应该是很熟悉的。要用爬虫爬取淘宝商品的信息,一般需要先获取每一页的信息,然后进行翻页操作。
淘宝对每一页的商品信息使用了异步加载的方法,并对翻页参数进行了加密处理。
有些参数信息是由javascript动态算法生成的,如果要了解其生成原理,就必须去解读大量的js代码,这样效率很低,所以我们可以使用自动化工具(selenium)很轻松地解决上面的问题。
代码实现:
from selenium import webdriver
from bs4 import BeautifulSoup
import time,random
import pandas as pd
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
# Module-level accumulator: url_open() appends one list per scraped product,
# and get_message() later turns the collected rows into a DataFrame.
a = list()
def url_open(url, keyword='电脑', max_pages=None):
    """Search for ``keyword`` in Firefox and scrape the result pages.

    Opens ``url``, types ``keyword`` into the ``q`` search box, submits the
    search and then walks the result pages one by one.  Every product found
    is appended to the module-level list ``a`` as
    ``[title, image src, shop, price, sales, description, service, logistics]``.

    Paging stops when any of the following happens:

    * ``max_pages`` pages have been scraped (``None`` means no limit),
    * the page has no usable ``#Jumper`` page-number input,
    * jumping to the next page fails or the pager does not report the new
      page number within 10 seconds (treated as "last page reached").

    Args:
        url: Start page that contains the search box with id ``q``.
        keyword: Text typed into the search box.
        max_pages: Optional upper bound on the number of pages to scrape.

    Raises:
        selenium.common.exceptions.WebDriverException: If the initial
            navigation or search submission fails.  The browser is closed
            before the exception propagates.
    """
    # Imported here so this function does not need extra file-level imports.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # Reads the pager's ``value`` attribute in the browser; this is the same
    # attribute the page number is parsed from in the page source below.
    jumper_value_js = (
        "var e = document.getElementById('Jumper');"
        "return e ? e.getAttribute('value') : null;"
    )

    driver = webdriver.Firefox()
    try:
        driver.implicitly_wait(8)
        driver.get(url)
        driver.find_element(By.ID, 'q').clear()
        driver.implicitly_wait(3)
        try:
            driver.find_element(By.ID, 'q').send_keys(keyword)
        except WebDriverException:
            # Best-effort fallback kept from the original code: press ENTER
            # in the search box first, then type the keyword again.
            driver.find_element(By.ID, 'q').send_keys(Keys.ENTER)
            driver.find_element(By.ID, 'q').send_keys(keyword)
        driver.find_element(By.CLASS_NAME, 'submit').click()

        # Per the original author's note the result list opens in a new
        # window, so switch to the most recently opened handle.  Using [-1]
        # instead of [1] also works when no new window was opened.
        driver.switch_to.window(driver.window_handles[-1])

        pages_done = 0
        while True:
            _scroll_page(driver)
            html = BeautifulSoup(driver.page_source, 'html.parser')
            a.extend(_parse_rows(html))
            pages_done += 1
            print('====================爬取结束====================')

            if max_pages is not None and pages_done >= max_pages:
                break

            try:
                page_num = int(html.select('#Jumper')[0].get('value'))
            except (IndexError, TypeError, ValueError):
                # No pager (or no numeric page value): nothing to jump to.
                break

            next_page = str(page_num + 1)
            try:
                jumper = driver.find_element(By.ID, 'Jumper')
                jumper.clear()
                jumper.send_keys(next_page)
                driver.find_element(By.CLASS_NAME, 'pageConfirm').click()
                # NOTE(review): assumes the pager's ``value`` attribute
                # changes to the new page number once the next page has
                # loaded -- confirm against the live page.
                WebDriverWait(driver, 10).until(
                    lambda d: d.execute_script(jumper_value_js) == next_page
                )
            except WebDriverException as e:
                # Covers a missing pager element as well as the wait timing
                # out; either way there is no further page to scrape.
                print(e)
                break
    finally:
        # Always close the browser, even when scraping fails half-way.
        driver.quit()


def _scroll_page(driver):
    """Scroll down in seven 1200px steps so lazily loaded items get rendered."""
    for step in range(1, 8):
        driver.execute_script('window.scrollTo(1,%d)' % (1200 * step))
        time.sleep(0.1)


def _parse_rows(html):
    """Return one row per product found in the parsed page, echoing each field."""
    rows = []
    products = zip(
        html.select('.info .title'),
        html.select('.imgLink img'),
        html.select('.shopNick'),
        html.select('span[class="pricedetail"] strong'),
        html.select('.payNum'),
        html.select('.dsr-info-list'),
    )
    for title, image, shop, price, sales, dsr in products:
        dsr_text = dsr.text.strip()
        row = [
            title.text,
            image.get('src', ''),  # tolerate <img> tags without a src
            shop.text.strip(),
            price.text.strip(),
            sales.text.strip(),
        ]
        for value in row:
            print(value)
        print(dsr_text)
        row.extend(_split_dsr(dsr_text))
        rows.append(row)
        print('---------------------------------------')
    return rows


def _split_dsr(text):
    """Split the rating text into [description, service, logistics] fields.

    Uses the same ':' / '服' / '发' splitting as the original code, but
    yields '' for any part that is missing instead of raising IndexError.
    """
    parts = text.split(':')
    description = parts[1].split('服')[0] if len(parts) > 1 else ''
    service = parts[2].split('发')[0] if len(parts) > 2 else ''
    logistics = parts[3] if len(parts) > 3 else ''
    return [description, service, logistics]
def get_message():
    """Write the rows collected in the module-level list ``a`` to ``10.csv``.

    The rows are loaded into a pandas DataFrame with eight named columns and
    saved with pandas' default CSV settings.
    """
    headers = ['用品名称','图片地址','商家名称','价格','销售量','描述','服务','物流']
    table = pd.DataFrame(a, columns=headers)
    table.to_csv('10.csv')
if __name__ == '__main__':
    # Script entry point: scrape the search results first, then dump the
    # collected rows to CSV.
    search_url = 'http://uland.taobao.com/sem/tbsearch?keyword=taoba&refpid=mm_26632360_8858797_29866178&clk1=b028d03d5b8e957a5b072b353603aa66&upsid=b028d03d5b8e957a5b072b353603aa66'
    url_open(search_url)
    get_message()