#!/usr/bin/python
# -*- coding: UTF-8 -*-
import html
import os
import re

import requests
from lxml import etree
# Prefix of the index and log file names, and name of the directory that
# receives the saved page files.
CITY = 'city'
# Credentials passed to getSession() for the login request.
USERNAME = 'name'
PASSWORD = 'password'
# Page range handed to range(): PAGE_START is the first page fetched and
# PAGE_END itself is excluded.
PAGE_START = 1
PAGE_END = 1100
# Log in and keep the session for later requests.
def getSession(username, password, timeout=30):
    """Log in to the site and return the ``requests.Session`` that did it.

    The credentials are POSTed to the AJAX login endpoint through a new
    session object, and that same object is returned so the caller can
    issue further requests through it.

    The login response is not inspected, so a rejected login is not
    detected here.

    Args:
        username: Account identifier, sent as the ``ID`` form field.
        password: Account password, sent as the ``PWD`` form field.
        timeout: Seconds to wait for the login request. Without a timeout
            requests waits indefinitely on an unresponsive server.

    Returns:
        The session object used for the login POST.

    Raises:
        requests.exceptions.RequestException: If the login request fails
            or times out.
    """
    # Login endpoint.
    login_url = 'http://www.test.com/index.php?ajax=1'
    # Form fields of the login request.
    form_data = {
        "com": 'com_passport',
        "method": 'dologin',
        "ID": username,
        "PWD": password,
        "checkbox": 'on',
    }
    # Browser-like User-Agent, sent with the login request only.
    headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
    session = requests.Session()
    session.post(login_url, data=form_data, headers=headers, timeout=timeout)
    return session
# Build the URL of one order-list page.
def getPageUrl(num):
    """Return the order-list URL for page ``num``.

    ``num`` is converted with ``str()`` and appended to the fixed list URL
    as the value of its trailing ``page=`` query parameter.
    """
    url_parts = (
        'http://www.test.com/index.php?method=index&app=order&page=',
        str(num),
    )
    return ''.join(url_parts)
# Save text to a file.
def saveFile(text, name):
    """Write ``text`` to the file ``name``, replacing any previous content.

    Args:
        text: String to write.
        name: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # UTF-8 is set explicitly so non-ASCII text does not depend on the
    # platform's default encoding; the context manager closes the handle
    # even when write() raises.
    with open(name, "w", encoding="utf-8") as out_file:
        out_file.write(text)
# Append an entry to one of the per-city log files.
def addLog(type, text):
    """Append ``text`` to the log file ``<CITY>_<type>.txt``.

    Args:
        type: Log category used in the file name; the callers in this file
            pass 'log', 'error' and 'success'. The name shadows the builtin
            but is kept so keyword callers keep working.
        text: Message to append.

    Raises:
        OSError: If the log file cannot be opened or written.
    """
    log_path = CITY + '_' + type + '.txt'
    # UTF-8 is set explicitly because the messages logged by this script
    # contain Chinese text; the context manager closes the handle even
    # when write() raises.
    with open(log_path, "a", encoding="utf-8") as log_file:
        # NOTE(review): entries are separated by a single space, as before;
        # confirm whether a newline was intended.
        log_file.write(text + " ")
# Extract the order blocks from a list page.
def getOrder(text):
    """Return the markup of every order block found in ``text``.

    Each ``<div class="order-content">`` element is serialized, its HTML
    entities are unescaped, and a single space is appended after it.

    Args:
        text: HTML source of one list page.

    Returns:
        The concatenated blocks, or ``''`` when ``text`` is blank or holds
        no matching element.
    """
    # Blank input cannot contain orders; return the empty result the caller
    # already treats as "no content" instead of handing it to the parser.
    if not text or not text.strip():
        return ''
    tree = etree.HTML(text)
    if tree is None:
        # NOTE(review): presumably lxml can return None when it builds no
        # document; guarded so the caller still receives ''.
        return ''
    order_divs = tree.xpath("//div[@class='order-content']")
    # Serialize each div, unescape its entities, then join in one pass
    # rather than growing a string with +=.
    return ''.join(
        html.unescape(etree.tostring(div).decode('utf-8')) + " "
        for div in order_divs
    )
##########################################################
# Script body: log in, fetch every list page and collect its orders.

# Page files are written under "<CITY>/"; create the directory up front so
# the first saveFile() call does not fail on a fresh run.
os.makedirs(CITY, exist_ok=True)

# Per-city index file. UTF-8 is set explicitly because Chinese text is
# written to it; the context manager closes it even if a request raises.
with open(CITY + '_index.html', "w", encoding="utf-8") as fcity:
    fcity.write(" ")
    # Log in once and reuse the session for every page request.
    SESSION = getSession(USERNAME, PASSWORD)
    # Walk the page range; PAGE_END itself is excluded by range().
    for num in range(PAGE_START, PAGE_END):
        page = str(num)
        PAGE_FILE = CITY + '/page_' + page.zfill(5) + '.html'
        PAGE_URL = getPageUrl(page)
        PAGE_RES = SESSION.get(PAGE_URL)
        # Record and echo every requested URL with its status code.
        addLog('log', PAGE_URL)
        print(PAGE_URL)
        print(PAGE_RES.status_code)
        # NOTE(review): everything after this check runs only for HTTP 200
        # responses - confirm this matches the intended nesting.
        if PAGE_RES.status_code != 200:
            # Message: "failed to fetch page N".
            addLog('error', '获取第' + page + '页失败')
            continue
        # Message: "fetched page N successfully".
        addLog('success', '获取第' + page + '页成功')
        # Keep a copy of the raw page.
        PAGE_TEXT = PAGE_RES.text
        saveFile(PAGE_TEXT, PAGE_FILE)
        # Append the page's orders to the index under a "page N" heading.
        ORDER_TEXT = getOrder(PAGE_TEXT)
        fcity.write(" 第" + page + "页 ")
        fcity.write(ORDER_TEXT)
        if ORDER_TEXT == '':
            # No order block found: mark it in the index ("failed to get
            # content") and log "failed to get details of page N".
            fcity.write("获取内容失败")
            addLog('error', '获取第' + page + '页详情失败')