import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
# Placeholder stored whenever a product does not provide a given field.
_MISSING = '无'

# (output column name, label text found in the <li>'s <span>), in output order.
_SPEC_COLUMNS = (
    ('CPU型号', 'CPU型号:'),
    ('电池容量', '电池容量:'),
    ('CPU频率', 'CPU频率:'),
    ('主屏尺寸', '主屏尺寸:'),
    ('后置摄像头', '后置摄像头:'),
    ('RAM容量', 'RAM容量:'),
)


def _spec_titles(intro):
    """Map each spec label in one ``pro-intro`` div to its ``title`` text."""
    specs = {}
    ul = intro.find('ul')
    if ul is None:
        return specs
    for li in ul.find_all('li'):
        label = li.span.string if li.span is not None else None
        if label is None:
            continue
        # Keep only the first occurrence: a repeated label must not add an
        # extra row, otherwise the columns end up with different lengths.
        specs.setdefault(str(label), li.get('title', _MISSING))
    return specs


def _child_string(parent, tag, css_class):
    """Return ``.string`` of the first matching descendant, or the placeholder."""
    node = parent.find(tag, class_=css_class)
    return _MISSING if node is None else node.string


def get_data(text):
    """Extract product columns from the HTML of one listing page.

    Args:
        text: HTML source of the page.

    Returns:
        A dict mapping column name to a list of values. Spec columns hold
        one entry per ``div.pro-intro`` inside the first ``div.list-box``;
        price and date hold one entry per ``div.price-box`` there. Fields
        that are absent are filled with ``'无'``. If the page has no
        ``div.list-box`` every column except possibly the name is empty.
    """
    soup = BeautifulSoup(text, 'html.parser')

    columns = {
        '名称': [a.text.strip()
                 for a in soup.select('div.list-box div.pro-intro h3 a')],
    }
    for column, _label in _SPEC_COLUMNS:
        columns[column] = []
    columns['价格'] = []
    columns['发布时间'] = []

    list_box = soup.find('div', class_="list-box")
    if list_box is None:
        # Not a listing page (error page, layout change): nothing to extract.
        return columns

    for intro in list_box.find_all('div', class_="pro-intro"):
        # One pass over the <li> items serves all six spec columns.
        specs = _spec_titles(intro)
        for column, label in _SPEC_COLUMNS:
            columns[column].append(specs.get(label, _MISSING))

    for box in list_box.find_all('div', class_="price-box"):
        columns['价格'].append(_child_string(box, 'b', "price-type"))
        columns['发布时间'].append(_child_string(box, 'span', "date"))

    return columns
# Scrape listing pages 1-40, collect one DataFrame per page, and write the
# combined table to a GBK-encoded CSV file.

# The request header is the same for every page, so build it once.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}

listL = []
total_rows = 0  # running row count, so the frames need not be re-concatenated per page
for p in range(1, 41):
    url = 'http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_{0}.html'.format(p)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        html = requests.get(url, headers=head, timeout=30)
        html.raise_for_status()
    except requests.RequestException as exc:
        # Skip the failed page instead of losing everything scraped so far.
        print('---第', p, '页请求失败,已跳过:', exc)
        time.sleep(3)
        continue
    html.encoding = 'gbk'
    data = get_data(html.text)
    df = pd.DataFrame(data)
    listL.append(df)
    total_rows += len(df)
    print('---完成了第',p,'页,爬取了',len(data['名称']),'条,累计',total_rows,'条')
    # Pause between requests.
    time.sleep(3)

# Concatenate once; pd.concat raises on an empty list, so fall back to an
# empty frame when every page failed.
Total = pd.concat(listL, ignore_index=True) if listL else pd.DataFrame()
Total.to_csv(r'E:\Python 学习\数据\中关村手机数据.csv',encoding='gbk')
print('爬取完成,共计有',len(Total),'条')