导入所需的库(requests 为第三方库,json 和 csv 为 Python 标准库)
import requests
import json
import csv
设置请求URL、请求头以及查询商品信息的表单参数
# Excerpt: request configuration used by the POST call in the next snippet.
# Endpoint the form is posted to (the f-string interpolates nothing).
url = f"http://www.xinfadi.com.cn/getPriceData.html"
# HTTP headers sent with the request.
# NOTE(review): these look like values copied from a browser session — confirm.
headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9",
"Connection": "keep-alive",
"Content-Length": "93",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"Host": "www.xinfadi.com.cn",
"Origin": "http://www.xinfadi.com.cn",
"Referer": "http://www.xinfadi.com.cn/priceDetail.html",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
"X-Requested-With": "XMLHttpRequest",
} # adjust as needed
# Form fields posted as the request body.
# `page` is not defined in this excerpt; presumably it is the loop variable
# of the full listing — verify against the complete code.
data = {
"limit": "20",
"current": page,
"pubDateStartTime":"",
"pubDateEndTime":"",
'prodPcatid':1186,
"prodCatid": "",
"prodName": "",
} # adjust as needed
发送请求,对爬取到的响应进行处理,并捕获可能出现的异常
# Excerpt: POST the form and collect the rows returned for one page.
# `all_products` and `page` come from code outside this excerpt.
response = requests.post(url=url, headers=headers, data=data)
if response.status_code == 200:
try:
response_data = response.json()
products = response_data.get('list', []) # assumes each page's rows are stored under the 'list' key
all_products.extend(products) # append the current page's products to the overall list
except json.JSONDecodeError:
# The body was not valid JSON: report the page number and the raw text.
print(f"Error decoding JSON for page {page}: {response.text}")
else:
# Reports the HTTP status code together with the page number.
print(f"Failed to fetch data for page {page}: {response.status_code}")
解析JSON数据,提取需要的字段
# Excerpt: map every collected row to a dict keyed by the Chinese column labels.
# `all_products` is filled by code outside this excerpt.
parsed_products = []
for product in all_products:
# Missing keys fall back to an empty string.
parsed_product = {
'品名': product.get('prodName', ''),
"最低价": product.get('lowPrice', ''),
'最高价': product.get('highPrice', ''),
'平均价': product.get('avgPrice', ''),
'规格': product.get('specInfo', ''),
'产地': product.get('place', ''),
'单位': product.get('unitInfo', ''),
'发布日期': product.get('pubDate', '')
}
parsed_products.append(parsed_product)
# Echo the mapped row to stdout.
print(parsed_product)
# The bare `return` means this excerpt belongs inside a function body.
return parsed_products
代码的运行以及文件的生成
if __name__ == '__main__':
    # Ask for the number of pages, then fetch and flatten them.
    num_pages = int(input('请输入爬取页数:'))
    parsed_products = fetch_and_parse_products(num_pages)
    # Further processing or saving of parsed_products can be added here.

    # CSV columns, in output order; they match the keys of each parsed row.
    columns = ['品名', '最低价', '最高价', '平均价', '规格', '产地', '单位', '发布日期']
    filename = 'products.csv'
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
        csv_writer.writeheader()  # header row
        csv_writer.writerows(parsed_products)  # one CSV line per product
完整代码
import requests
import json
import csv
def fetch_and_parse_products(num_pages, *, limit=20, prod_pcatid=1186, timeout=10):
    """Fetch price rows page by page and map them to CSV-ready dicts.

    Pages ``1..num_pages`` are requested one after another. A page that
    cannot be fetched or decoded is reported on stdout and skipped, so the
    rows gathered from the other pages are still returned.

    Args:
        num_pages: Number of pages to request; values <= 0 request nothing.
        limit: Value of the ``limit`` form field (rows per page).
        prod_pcatid: Value of the ``prodPcatid`` form field.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the crawl forever.

    Returns:
        A list of dicts keyed by the Chinese column labels (品名, 最低价, ...),
        one per row received, in the order the rows were received.
    """
    # Loop-invariant request configuration, built once.
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    # No hard-coded Content-Length: requests derives it from the encoded body.
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Host": "www.xinfadi.com.cn",
        "Origin": "http://www.xinfadi.com.cn",
        "Referer": "http://www.xinfadi.com.cn/priceDetail.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }  # adjust as needed
    # (CSV column label, key in the API row) pairs, in output order.
    field_map = (
        ('品名', 'prodName'),
        ('最低价', 'lowPrice'),
        ('最高价', 'highPrice'),
        ('平均价', 'avgPrice'),
        ('规格', 'specInfo'),
        ('产地', 'place'),
        ('单位', 'unitInfo'),
        ('发布日期', 'pubDate'),
    )

    all_products = []
    for page in range(1, num_pages + 1):
        data = {
            "limit": str(limit),
            "current": page,
            "pubDateStartTime": "",
            "pubDateEndTime": "",
            "prodPcatid": prod_pcatid,
            "prodCatid": "",
            "prodName": "",
        }  # adjust as needed

        try:
            response = requests.post(
                url=url, headers=headers, data=data, timeout=timeout
            )
        except requests.RequestException as exc:
            # Connection errors and timeouts: skip this page, keep the rest.
            print(f"Failed to fetch data for page {page}: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for page {page}: {response.status_code}")
            continue

        try:
            response_data = response.json()
        except ValueError:
            # json.JSONDecodeError is a ValueError subclass; catching the
            # base class also covers decoders that raise plain ValueError.
            print(f"Error decoding JSON for page {page}: {response.text}")
            continue

        # Rows are expected under the 'list' key; tolerate a missing key,
        # an explicit null, or a payload that is not a JSON object.
        products = response_data.get('list') if isinstance(response_data, dict) else None
        all_products.extend(products or [])

    # Flatten every raw row into the CSV column layout.
    parsed_products = []
    for product in all_products:
        parsed_product = {label: product.get(key, '') for label, key in field_map}
        parsed_products.append(parsed_product)
        print(parsed_product)
    return parsed_products
if __name__ == '__main__':
    # Ask for the number of pages, then fetch and flatten them.
    num_pages = int(input('请输入爬取页数:'))
    parsed_products = fetch_and_parse_products(num_pages)
    # Further processing or saving of parsed_products can be added here.

    # CSV columns, in output order; they match the keys of each parsed row.
    columns = ['品名', '最低价', '最高价', '平均价', '规格', '产地', '单位', '发布日期']
    filename = 'products.csv'
    with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=columns)
        csv_writer.writeheader()  # header row
        csv_writer.writerows(parsed_products)  # one CSV line per product
    print("所有数据处理完成")
数据展示