以下是当当网手机配件分类(cid4001049)的爬虫源码:按价格区间分段遍历商品列表,抓取商品名称、价格、店铺和链接,并保存为 CSV 文件。
# -*- coding: utf-8 -*-
import re
import requests
import time
import datetime
import csv
#http://category.dangdang.com/pg2-cid4001049-lp166-hp168.html
# star_url = 'http://category.dangdang.com/cid4001049-lp0-hp5.html'
def _fetch(url):
    """GET *url* and return its HTML with all whitespace stripped.

    Returns None on any network error or non-200 response so callers can
    skip the page; the original reused the previous request's stale
    ``data`` (or hit a NameError) when a request failed.
    """
    try:
        response = requests.get(url=url, timeout=3)
    except requests.RequestException as err:
        print(err)
        return None
    if response.status_code != 200:
        return None
    # All regex patterns below are written against whitespace-free markup,
    # so collapse every kind of whitespace up front.
    return (response.text.replace('\n', '').replace('\t', '')
                         .replace(' ', '').replace('\r', ''))


def _first(matches, err_msg):
    """Return (and print) the first regex capture, or '' after logging *err_msg*.

    Returning '' keeps a failed capture from silently reusing the previous
    product's value, which the original code did.
    """
    if matches:
        print(matches[0])
        return matches[0]
    print(err_msg)
    return ''


def _scrape_page(data):
    """Parse one listing page and append each product as a CSV row."""
    goods = re.findall('dd_name="单品图片"(.*?)加入购物车<', data)
    print(goods)
    for gd in goods:
        sp_name = _first(re.findall("alt='(.*?)'", gd),
                         'capture goods name err')
        price = _first(re.findall('>¥(.*?)</span></p><', gd),
                       'capture price is err')
        shops_name = _first(re.findall('target="_blank"title="(.*?)">', gd),
                            'capture shops_name is err')
        # Comment count is printed for inspection but (as in the original)
        # deliberately not written to the CSV row.
        pls = re.findall('dd_name="单品评论"(.*?)</p>', gd)
        if pls:
            _first(re.findall('>(.*?)</a>', pls[0]), 'capture comments2 is err')
        else:
            print('capture comments is err')
        url_id = re.findall('href="http://product.dangdang.com(.*?).html"', gd)
        if url_id:
            goods_url = 'http://product.dangdang.com' + url_id[0] + '.html'
            print(goods_url)
        else:
            goods_url = ''
            print('capture url_goods is err')
        now_time = datetime.datetime.now().strftime('%Y-%m-%d')
        print(now_time)
        source = '当当网'
        filename = '20200416当当网手机配件数据'
        try:
            # newline='' avoids blank rows on Windows; utf-8 keeps the
            # Chinese fields writable regardless of the platform locale.
            with open(filename + '.csv', 'a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow([sp_name, price, shops_name, goods_url,
                                 source, now_time])
        except OSError as err:
            print(err)


def cn_url():
    """Crawl Dangdang's phone-accessory category (cid4001049).

    Walks 2-RMB price bands [0,2), [2,4), ... up to 2500 RMB.  For each
    band it reads the total item count, derives the page count (48 items
    per page), fetches every page, and appends the scraped products to a
    dated CSV file.  Network-bound; returns nothing.
    """
    base = 'http://category.dangdang.com/'
    category = 'cid4001049-lp'
    low = 0
    high = low + 2
    for _ in range(0, 1250):  # 1250 bands of 2 RMB covers prices 0..2500
        band_url = base + category + str(low) + '-hp' + str(high) + '.html'
        print(band_url)
        data = _fetch(band_url)
        if data is not None:
            print(len(data))
            # e.g. 共<em class="b">96</em>件商品 (spaces already stripped)
            num = re.findall('共<emclass="b">(.*?)</em>件商品</span>', data)
            if num:
                # 48 products per page; +2 so range(1, pages) still covers
                # the final partial page.
                pages = int(num[0]) // 48 + 2
                for page in range(1, pages):
                    print(page)
                    page_url = (base + 'pg' + str(page) + '-' + category
                                + str(low) + '-hp' + str(high) + '.html')
                    print(page_url)
                    page_data = _fetch(page_url)
                    if page_data is None:
                        continue
                    time.sleep(2)  # politeness delay between page fetches
                    _scrape_page(page_data)
        # Advance to the next price band only AFTER paging the current one.
        # The original advanced low/high before the inner loop, so every
        # per-page URL queried the wrong (next) band.
        low, high = high, high + 2
# Run the crawl only when executed as a script, not on import.
if __name__ == '__main__':
    cn_url()
至此,当当网手机配件数据爬虫介绍完毕。