My first attempt at a spider for a DWR-framework site. It is not perfect yet, but I am writing it down for the record.
import requests
import time
class Lyspider:
    # Initialisation: set up a requests session with a browser User-Agent
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    # Fetch the data: POST the DWR plaincall request for each page
    def getdata(self):
        lis = []
        for i in range(1, 2):
            data = {
                'callCount': '1',
                'windowName': '',
                'c0-scriptName': 'MessageService',
                'c0-methodName': 'queryContent',
                'c0-id': '0',
                'c0-e1': 'string:',
                'c0-e2': 'number:10',
                'c0-e3': 'number:' + str(i),
                'c0-e4': 'number:1',
                'c0-e5': 'string:',
                'c0-e6': 'string:',
                'c0-e7': 'string:',
                'c0-e8': 'string:39',
                'c0-e9': 'string:',
                'c0-param0': 'Object_Object:{keyword:reference:c0-e1, pageCount:reference:c0-e2, curPage:reference:c0-e3, itemType:reference:c0-e4, startDate:reference:c0-e5, endDate:reference:c0-e6, title:reference:c0-e7, repeatroleid:reference:c0-e8, hurdleid:reference:c0-e9}',
                'batchId': str(i),
                'page': '%2Frobot%2FpublicComments.html',
                'httpSessionId': '',
                'scriptSessionId': '98D2B1B674FA9B05BFF6C905F5345159'
            }
            res = self.session.post('http://gzhd.samr.gov.cn:8500/robot/dwr/call/plaincall/MessageService.queryContent.dwr', data=data)
            if res.status_code == 200:
                print('Page %s fetched successfully!' % i)
                ori = res.text
                lis.append(ori)
                time.sleep(0.5)
            else:
                print('Failed to fetch page %s!' % i)
        return lis

    # Data processing: turn the raw responses into individual messages
    def process_data(self):
        results = []
        result = self.getdata()
        # String slicing: trim each page down to the [...] payload
        for text in result:
            text = text[text.find('[') + 1:]
            text = text[:text.rfind(']')]
            text = text.replace('},', '}')
            text = text.replace(',', '\n')
            # Split each page into single messages and decode the \uXXXX escapes
            for ch in text:
                if ch == '}':
                    s1 = text[:text.find(ch) + 1]
                    s1 = s1.encode('utf8').decode('unicode_escape')
                    results.append(s1)
                    text = text[text.find(ch) + 1:]
        return results

    # Data saving (for now the messages are just printed)
    def save_data(self):
        # Processed data
        resu = self.process_data()
        for data in resu:
            print(data + '\n')
        print('%s messages in total!' % len(resu))

spider = Lyspider()
spider.save_data()
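
The part that makes this "not perfect" is process_data: the slicing plus the character loop is fiddly and falls over if a message body happens to contain an extra brace. As a rough sketch of an alternative, under the same assumption the class already makes (the DWR response carries a bracketed list of flat {...} object literals with \uXXXX-escaped text), a regular expression can pull the objects out in one pass. split_dwr_objects is a hypothetical helper for illustration, not part of the original script.

import re

def split_dwr_objects(raw):
    # Assumes a flat list of {...} literals with no nested braces,
    # the same assumption the slicing in process_data relies on.
    objects = re.findall(r'\{[^{}]*\}', raw)
    # Decode the \uXXXX escapes, just like process_data does.
    return [o.encode('utf-8').decode('unicode_escape') for o in objects]

With something like this, process_data could simply call results.extend(split_dwr_objects(page)) for each page string returned by getdata().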