想要爬取一份京东商城的商品数据用来做数据分析
使用 Selenium 比较稳定,可以绕过很多反爬虫机制;
但 Selenium 的效率较低,而且通过循环翻页的方法抓取到的数据会有重复,需要后期去重
直接上代码
import time
import urllib.parse
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
def get_url(keyword, i):
    """Build the JD search URL for logical page *i* of *keyword*.

    Args:
        keyword: product search term (may contain non-ASCII characters).
        i: 1-based logical page number.

    Returns:
        str: the fully formatted search URL.
    """
    # Percent-encode the keyword so it is safe inside the query string.
    encoded_keyword = urllib.parse.quote(keyword)
    # JD numbers result pages oddly: logical page i maps to page=2*i-1.
    page_param = 2 * i - 1
    template = ('https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1'
                '&rt=1&stop=1&vt=2&bs=1&page={}&s=61&click=0')
    return template.format(encoded_keyword, page_param)
def get_date(url):
    """Fetch *url* with headless Chrome, scroll to trigger lazy loading,
    and return the fully rendered page source.

    Args:
        url: the search-result page URL to load.

    Returns:
        str: the page HTML after scrolling.
    """
    chrome_options = Options()
    # Run without a visible browser window.
    chrome_options.add_argument('--headless')
    # Selenium 4 removed the chrome_options= keyword; use options= instead.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(3)
        # Scroll to the bottom several times so lazily loaded goods render.
        for _ in range(3):
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            ActionChains(driver).key_up(Keys.DOWN).perform()
            time.sleep(2)
        time.sleep(3)
        return driver.page_source
    finally:
        # quit() (not close()) terminates the chromedriver process and
        # releases resources even when an exception occurs above;
        # close() only closes the window and leaks the driver process.
        driver.quit()
def parse_date(html):
    """Parse a JD search-result page and yield one dict per product.

    Args:
        html: page source produced by get_date().

    Yields:
        dict with keys 店名/商品名/价格/评论数.
    """
    soup = BeautifulSoup(html, 'lxml')
    all_content = soup.select('#J_goodsList > ul > li')
    for content in all_content:
        # Ad/promoted slots lack some of these nodes (商城广告推荐位没有店名),
        # and any missing node makes a chained lookup raise AttributeError.
        # The original guarded only the shop lookup with a bare except; a
        # malformed price/name/commit node would kill the whole generator.
        try:
            # 价格 (price)
            price = content.find('div', class_="p-price").strong.i.string
            # 商品名 (product name)
            name = content.find('div', class_="p-name").a.em.get_text()
            # 店名 (shop name)
            shop = content.find('div', class_='p-shop').span.a.string
            # 评论数 (comment count)
            commit = content.find('div', class_="p-commit").strong.a.string
        except AttributeError:
            # Skip incomplete entries instead of crashing the page parse.
            continue
        yield {
            '店名': shop,
            '商品名': name,
            '价格': price,
            '评论数': commit,
        }
def save_date(keyword, item):
    """Append the scraped items to '<keyword>.csv'.

    Args:
        keyword: search keyword; used as the CSV file name.
        item: iterable of per-product dicts (from parse_date()).
    """
    import os
    path = keyword + '.csv'
    good_df = pd.DataFrame(item)
    # Write the column header only on the first append: the original passed
    # mode='a' unconditionally, inserting a duplicate header row (and a
    # restarting auto-index column) for every page scraped.
    good_df.to_csv(path, mode='a', header=not os.path.exists(path),
                   index=False, encoding='utf-8-sig')
def main():
    """Prompt for a keyword, then scrape and save 100 result pages."""
    keyword = input('输入你要获取数据的商品名:')
    page_no = 1
    while page_no <= 100:
        page_url = get_url(keyword, page_no)
        page_html = get_date(page_url)
        goods = parse_date(page_html)
        save_date(keyword, goods)
        print('第%d页已爬取完' % page_no)
        page_no += 1


if __name__ == '__main__':
    main()
爬取效果: 通过Excel去重之后有4000多件商品
其他商品的数据也可以用同样的方法爬取下来