京东商品的信息是比较好爬取的,思路如下:
1.因为京东商品页面是由JavaScript渲染的,所以可以用selenium库来获取商品页面的源代码
2.获取了商品的源代码后,用正则表达式库(re)和著名的“美丽的汤”(BeautifulSoup)库来解析所需要的商品的属性,比如商品名称、价格、评价数
3.用pandas库把解析后的信息保存到csv文件
具体的代码如下:
#爬取京东手机信息
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import re
import pandas as pd
# Chrome preferences: per the original author's note, this setting stops
# images from being loaded.
_chrome_prefs = {"profile.managed_default_content_settings.images": 2}

options = webdriver.ChromeOptions()
# Per the original author's note, excluding the 'enable-automation' switch
# (developer mode) is meant to keep sites from detecting Selenium; the call
# is currently disabled:
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", _chrome_prefs)

# Start Chrome with the options above and maximize its window.
driver = webdriver.Chrome(options=options)
driver.maximize_window()
def get_detil(url):
detil_list=[]
headers={
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'cookie': '__jdu=1503776177; shshshfpa=fc731dcb-bbe0-5ef8-a758-feb8361f1279-1558793417; shshshfpb=jXoIibDZ2Cg1j2c7AzOnLpQ%3D%3D; unpl=V2_ZzNtbUEFRhV1Wk8Dch1ZAGIAFl0RAxcWc1gTVi5OXQVnBhFdclRCFX0URlVnGlgUZAEZXkpcQBNFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsdXQdhBRVdRFFzJXI4dmR%2fH1wBZwQiXHJWc1chVERScxldSGcHE19EUUQVcw52VUsa; user-key=72ca3dde-5dc8-457b-a8cc-732a9ad2944e; cn=0; PCSYCityID=CN_500000_500100_0; areaId=4; ipLoc-djd=4-113-9786-0; __jdv=122270672|baidu|-|organic|%25E7%2588%25AC%25E5%258F%2596%25E4%25BA%25AC%25E4%25B8%259C%25E5%2595%2586%25E5%2593%2581%25E4%25BF%25A1%25E6%2581%25AF|1573982428458; mt_xid=V2_52007VwMWU19eVF0fTx9sV28ARwcJWFBGSxlJVRliAhtWQVAAD09VSVQMZwUVW11RBlsYeRpdBW8fElJBW1NLHksSXAZsAhdiX2hSahZKGlQCbwUWU21YVF4b; shshshfp=502935447455162f98afcb9bb4fbd4fc; shshshsID=cb05e3feb787fd0b6a52a993c47979a2_12_1574193435659; __jda=122270672.1503776177.1546445332.1574186022.1574190631.44; __jdb=122270672.12.1503776177|44.1574190631; __jdc=122270672; 3AB9D23F7A4B3C9B=F3DVZOZZIY4HWG2IDIZMN2EKWAM7OPZYR7EZBWT5HFZGUUV7UQTHSXYEW6A55TWAQVO3KOVJP7G64CNVJK4ABS4GCQ'
}
r=requests.get(url,headers=headers)
soup=BeautifulSoup(r.text,'html.parser')
all_detil=soup.find_all('ul',class_="parameter2 p-parameter-list")
good_weight=re.findall('(?<=商品毛重:)(.+?)(?=</li>)',str(all_detil))
good_cpu=re.findall('(?<=CPU型号:)(.+?)(?=</li>)',str(