1.请求接口:
2.requests请求:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import json
import time
import requests
class REWQ:
    """One-off check of the producer-list endpoint using plain ``requests``.

    The request replays a captured browser POST: fixed headers, the captured
    session cookies in the ``Cookie`` header, and a form-encoded body that
    selects the result page.
    """

    # NOTE(review): these are captured login-session cookies hard-coded in
    # source; presumably they stop working once the session ends -- confirm
    # and consider loading them from configuration instead.
    # The single ``%s`` placeholder is the value of the last cookie
    # (``Hm_lpvt_...``), which ``test`` refreshes with the current epoch time.
    _COOKIE_TEMPLATE = (
        'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; '
        'SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; '
        'factoryLogin_rand=ADEAOQA0; '
        'salesmanLogin_rand=ADgAMgAw; '
        'SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; '
        'SESSION_ROLE_TYPE=ThpSoVRY; '
        'SESSION_NAME=jCJRdpdZ; '
        'SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; '
        'SESSION_SALESMAN_ID=ADgAMwAzADUAMB; '
        'SESSION_COMPANY_NAME=; '
        'SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; '
        'SESSION_CONTROL=AFAAQQBTAFD; '
        'SESSION_LOGIN_MESSAGE=; '
        'JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; '
        'Hm_lpvt_87ef776965ff233cab5e4054821b5e94=%s'
    )

    # Form body of the POST; ``%s`` is the page number (CURRENT_PAGE).
    _BODY_TEMPLATE = (
        'ORDER_BY_FIELD=ORDER_NUMBER&ORDER_BY_DIRECTION=ASC&RECORD_COUNT=924'
        '&PAGE_COUNT=10&PAGINAL_COUNT=100&CURRENT_PAGE=%s'
    )

    def __init__(self):
        """Store the endpoint URL and the captured browser headers."""
        self.url = 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp'
        self.headers = {
            'Host': 'www.dqzsteel.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Origin': 'http://www.dqzsteel.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # Initial value carries the timestamp that was captured with the session.
            'Cookie': self._COOKIE_TEMPLATE % '1657870616',
        }

    def test(self, page=1, timeout=30):
        """POST the request for one result page and print the response body.

        Args:
            page: Page number sent as ``CURRENT_PAGE`` (defaults to the first page).
            timeout: Seconds to wait for the server before ``requests`` gives
                up; without it a stalled connection would block forever.
        """
        # Refresh the last cookie with the current epoch time before sending.
        self.headers['Cookie'] = self._COOKIE_TEMPLATE % int(time.time())
        req_body = self._BODY_TEMPLATE % page
        res = requests.post(url=self.url, data=req_body, headers=self.headers, timeout=timeout)
        print(res.text)
if __name__ == '__main__':
    # Script entry point: build the checker and fire the single test request.
    REWQ().test()
成功响应
3.scrapy请求
import json
import time
from urllib.parse import urlencode
import scrapy
from utils.robot import send_msg
class SalesmaninfospiderSpider(scrapy.Spider):
    """First Scrapy port of the producer-list request.

    The session cookies are sent only inside the raw ``Cookie`` header; no
    ``cookies=`` argument is handed to ``scrapy.Request``.
    """

    name = 'SalesmanInfoSpider'

    def __init__(self):
        """Store the endpoint URL and the captured browser headers."""
        self.url = 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp'
        self.headers = {
            'Host': 'www.dqzsteel.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Origin': 'http://www.dqzsteel.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cookie': 'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; factoryLogin_rand=ADEAOQA0; salesmanLogin_rand=ADgAMgAw; SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; SESSION_ROLE_TYPE=ThpSoVRY; SESSION_NAME=jCJRdpdZ; SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; SESSION_SALESMAN_ID=ADgAMwAzADUAMB; SESSION_COMPANY_NAME=; SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; SESSION_CONTROL=AFAAQQBTAFD; SESSION_LOGIN_MESSAGE=; JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; Hm_lpvt_87ef776965ff233cab5e4054821b5e94=1657870616',
        }

    def start_requests(self):
        """Yield the POST request for the first result page."""
        current_page = 1
        # The last cookie (Hm_lpvt_...) is refreshed with the current epoch time.
        now_ts = str(int(time.time()))
        self.headers['Cookie'] = 'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; factoryLogin_rand=ADEAOQA0; salesmanLogin_rand=ADgAMgAw; SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; SESSION_ROLE_TYPE=ThpSoVRY; SESSION_NAME=jCJRdpdZ; SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; SESSION_SALESMAN_ID=ADgAMwAzADUAMB; SESSION_COMPANY_NAME=; SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; SESSION_CONTROL=AFAAQQBTAFD; SESSION_LOGIN_MESSAGE=; JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; Hm_lpvt_87ef776965ff233cab5e4054821b5e94=%s' % now_ts
        form_body = 'ORDER_BY_FIELD=ORDER_NUMBER&ORDER_BY_DIRECTION=ASC&RECORD_COUNT=924&PAGE_COUNT=10&PAGINAL_COUNT=100&CURRENT_PAGE=%s' % current_page
        # Cookies travel only in the header dict here; ``cookies=`` is not passed.
        first_request = scrapy.Request(
            url=self.url,
            method="POST",
            body=form_body,
            headers=self.headers,
            meta={"page": current_page},
            callback=self.get_product,
            dont_filter=True,
        )
        yield first_request

    def get_product(self, response):
        """Print the headers that were sent and the body that came back.

        Any exception raised while printing is forwarded to ``send_msg``.
        """
        page = response.meta.get("page")
        try:
            print(response.request.headers)
            print(response.text)
        except Exception as e:
            send_msg(e)
4.请求成功,但是返回的内容与requests的不一样
scrapy传递cookie的方式与requests不一样:不能只放在headers的Cookie字段里,需要解析成字典后通过Request的cookies参数传入,改为以下即可:
cookies = {i.split("=")[0]: i.split("=")[1] for i in str(self.headers['Cookie']).split("; ")}
yield scrapy.Request(url=self.url, method="POST", body=req_body, headers=self.headers,cookies=cookies,
meta={"page": page},
callback=self.get_product, dont_filter=True)
成功返回与requests相同的页面:
5.改正后,scrapy完整代码:
import os
import time
import scrapy
import pandas as pd
from utils.robot import send_msg
class SalesmaninfospiderSpider(scrapy.Spider):
    """Crawl the paginated producer list and append the rows to a CSV file.

    Each page is fetched with a POST that replays the captured browser
    headers. The session cookies are sent in the ``Cookie`` header and, parsed
    into a dict, through the ``cookies=`` argument of ``scrapy.Request``.
    """

    name = 'SalesmanInfoSpider'

    # NOTE(review): these are captured login-session cookies hard-coded in
    # source; presumably they stop working once the session ends -- confirm
    # and consider loading them from settings instead.
    # The single ``%s`` placeholder is the value of the last cookie
    # (``Hm_lpvt_...``), refreshed with the current epoch time per request.
    _COOKIE_TEMPLATE = (
        'Hm_lvt_87ef776965ff233cab5e4054821b5e94=1657867719; '
        'SALESMAN_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; '
        'factoryLogin_rand=ADEAOQA0; '
        'salesmanLogin_rand=ADgAMgAw; '
        'SESSION_ID=ADEANgA1ADcAOAA2ADkAMQAwADAANAAyADI; '
        'SESSION_ROLE_TYPE=ThpSoVRY; '
        'SESSION_NAME=jCJRdpdZ; '
        'SESSION_MOBILE=ADEAOAA5ADIAMgAzADQAMQA2ADIANB; '
        'SESSION_SALESMAN_ID=ADgAMwAzADUAMB; '
        'SESSION_COMPANY_NAME=; '
        'SESSION_LOGIN_TYPE=U1V1KGI3Y6dSNC; '
        'SESSION_CONTROL=AFAAQQBTAFD; '
        'SESSION_LOGIN_MESSAGE=; '
        'JSESSIONID=9Y8zvRZRGZgnnf1ynJbyKtwFrj96TpvfVHLFwPgpHDGnnXSvFpDc!1557300612!719156084; '
        'Hm_lpvt_87ef776965ff233cab5e4054821b5e94=%s'
    )

    # Form body of the POST; ``%s`` is the page number (CURRENT_PAGE).
    _BODY_TEMPLATE = (
        'ORDER_BY_FIELD=ORDER_NUMBER&ORDER_BY_DIRECTION=ASC&RECORD_COUNT=927'
        '&PAGE_COUNT=10&PAGINAL_COUNT=100&CURRENT_PAGE=%s'
    )

    # Pagination stops after this page (the body above also sends PAGE_COUNT=10).
    _LAST_PAGE = 10

    def __init__(self, *args, **kwargs):
        """Store the endpoint URL and the captured browser headers.

        Positional and keyword arguments are forwarded to ``scrapy.Spider`` so
        the base class is initialised and spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        self.url = 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp'
        self.headers = {
            'Host': 'www.dqzsteel.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'Origin': 'http://www.dqzsteel.com',
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Referer': 'http://www.dqzsteel.com/Software/Salesman/SpotPrice/ProducerList.jsp',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            # Initial value carries the timestamp that was captured with the session.
            'Cookie': self._COOKIE_TEMPLATE % '1657870616',
        }

    @staticmethod
    def _parse_cookie_header(cookie_header):
        """Turn a ``"a=1; b=2"`` Cookie header into a ``{name: value}`` dict.

        Pairs are split on the first ``=`` only, so values that contain ``=``
        are kept whole and an entry without ``=`` maps to an empty string.
        """
        pairs = (item.partition("=") for item in str(cookie_header).split("; "))
        return {cookie_name: cookie_value for cookie_name, _, cookie_value in pairs}

    def _build_request(self, page):
        """Return the POST request for result page *page*."""
        # Refresh the last cookie with the current epoch time before sending.
        cookie_header = self._COOKIE_TEMPLATE % int(time.time())
        self.headers['Cookie'] = cookie_header
        return scrapy.Request(
            url=self.url,
            method="POST",
            body=self._BODY_TEMPLATE % page,
            headers=self.headers,
            # Scrapy's cookie handling works from ``cookies=``, not from the
            # raw Cookie header, so the same cookies are passed as a dict.
            cookies=self._parse_cookie_header(cookie_header),
            meta={"page": page},
            callback=self.get_product,
            dont_filter=True,
        )

    def _append_rows(self, table):
        """Clean one scraped table and append it to ``excel/1.csv``."""
        # Drop the columns that are not wanted in the export.
        rows = table.drop([0, 2, 4, 6, 7, 8, 9, 10], axis=1)
        # Drop every row that contains at least one empty cell. Only ``how``
        # is passed: recent pandas raises TypeError when ``how`` and
        # ``thresh`` are both given.
        rows = rows.dropna(axis=0, how='any')
        # Rebuild the index; the result must be assigned, it is not in-place.
        rows = rows.reset_index(drop=True)
        print(rows)

        out_dir = os.path.join(os.getcwd(), 'excel')
        os.makedirs(out_dir, exist_ok=True)  # to_csv does not create directories
        path = os.path.join(out_dir, "1.csv")
        # Write the header row only when starting a new/empty file, so the
        # pages appended afterwards do not repeat it in the middle of the data.
        write_header = not os.path.isfile(path) or os.path.getsize(path) == 0
        rows.to_csv(path, mode='a', header=write_header)

    def start_requests(self):
        """Yield the POST request for the first result page."""
        yield self._build_request(1)

    def get_product(self, response):
        """Export the table of one result page and schedule the next page.

        The first table found in the response is cleaned and appended to the
        CSV file, but only when the page contains more than one table. While
        the current page is below the last page, the request for the next page
        is yielded. Any exception is forwarded to ``send_msg`` rather than
        raised.
        """
        # Local import keeps this block self-contained; read_html expects a
        # file-like object for literal HTML in current pandas releases.
        from io import StringIO

        page = response.meta.get("page")
        print("正在爬取第", str(page), "页!")
        try:
            html = response.text.replace('<br>', ' ')
            tables = pd.read_html(StringIO(html))
            if len(tables) > 1:
                self._append_rows(tables[0])
                # Next page
                if page < self._LAST_PAGE:
                    yield self._build_request(page + 1)
                else:
                    print("爬取完成!")
        except Exception as e:
            # Best-effort crawl: report the failure instead of aborting the spider.
            send_msg(e)