电子发票通常以 PDF 文件的形式存在,各大电商及运营商平台几乎都提供电子发票,如京东、淘宝(天猫)、苏宁易购、携程、中国联通、中国电信、中国移动等。那么,我们如何以编程方式抓取这些平台的电子发票呢?这里直接给出代码供参考(以中国电信为例)。实际上,经测试,各大平台的电子发票都可以用类似的方式抓取。欢迎加 QQ 283335746 共同探讨以编程方式抓取数据信息。
/// <summary>
/// Fetches electronic invoices (PDF) for a China Telecom (189.cn) account using the
/// caller-supplied session cookie, parses them and hands them to the invoice BLL.
/// </summary>
/// <remarks>
/// The city sub-domain and the list of queryable months are resolved once per instance,
/// so instances created for different users/cities do not affect each other.
/// </remarks>
public class ChinaTelecom
{
    // Pristine format templates. They are never overwritten, so every instance can
    // format its own URLs from them.
    private const string CityHostTemplate = "http://www.189.cn{0}";
    private const string BaseUrlTemplate = "http://{0}.189.cn";

    // Regex template handed to Common.CookieFilter; {0} is the cookie name.
    private const string CookieValuePattern = @"\s*{0}\s*=\s*(.|\n)*?;";

    // Format of the acctMonth query value.
    private const string YearMonthFormat = "yyyyMM";

    // Upper bound on the number of months queried.
    private const int MaxYearMonths = 12;

    // Kept for backward compatibility with code that reads these statics. They start as
    // the templates and are updated with the most recently resolved values by the
    // constructor; the instance itself only relies on its own _baseUrl.
    internal static string CityHost = CityHostTemplate;
    internal static string BaseUrl = BaseUrlTemplate;

    internal const string QueryInvoicesFirstUrl = "{0}/pages/selfservice/usercomplaintsinfo/queryNetInvoice.action?pageDataNum=10";
    internal const string QueryInvoicesUrl = "{0}/pages/selfservice/usercomplaintsinfo/queryNetInvoice.action?pageDataNum=12&acctMonth={1}";
    internal const string DownloadInvoiceUrl = "{0}/pages/selfservice/usercomplaintsinfo/DownloadNetInvoice.action?index=1";

    private readonly string _userId = string.Empty;
    private readonly string _userCookie = string.Empty;
    private readonly string _relationUserId = string.Empty;
    private readonly string _baseUrl = string.Empty;
    private readonly List<string> _yearMonths = new List<string>();
    private readonly NetClient _client;
    private readonly ThirdPartyRequest _thirdPartyRequest;
    private readonly ThirdPartyBll _invoiceBll;

    private List<DownloadInvoiceInfo> OrderInvoices { get; set; }

    private ChinaTelecom() { }

    /// <summary>
    /// Creates a crawler bound to one user and one 189.cn session cookie.
    /// </summary>
    /// <param name="userId">Identifier of the local user the invoices are saved for.</param>
    /// <param name="cookieAppend">Cookie string of the 189.cn session; the "userId" and
    /// "cityCode" entries are extracted from it.</param>
    public ChinaTelecom(string userId, string cookieAppend)
    {
        this._userId = userId;
        this._userCookie = cookieAppend;
        this._relationUserId = Common.CookieFilter(_userCookie, string.Format(CookieValuePattern, "userId"));
        this._baseUrl = GetCityDomain();
        this._thirdPartyRequest = new ThirdPartyRequest(cookieAppend, ThirdPartyOptions.ChinaTelecom);
        this._yearMonths = GetYearMonths();
        this.OrderInvoices = new List<DownloadInvoiceInfo>();
        this._invoiceBll = new ThirdPartyBll(_userId, _relationUserId, ThirdPartyOptions.ChinaTelecom);
        this._client = new NetClient(_baseUrl);
    }

    /// <summary>
    /// Entry point: downloads and parses the invoices, then saves the collected set.
    /// </summary>
    /// <returns>A task that completes when the invoices have been saved.</returns>
    public async Task ExecuteInvoiceAsync()
    {
        if (Log.IsDebugEnabled) Log.Debug("ChinaTelecom.ExecuteInvoiceAsync is starting,UserId:{0},RelationUserId:{1}", _userId, _relationUserId);

        // Start from an empty set so a repeated call does not save earlier results again.
        OrderInvoices.Clear();

        // Download and parse the invoice information.
        await GetOrderInvoicesAsync();

        // Persist the collected invoice information.
        await _invoiceBll.SaveDownloadInvoice(OrderInvoices);
    }

    /// <summary>
    /// Queries the invoice page for each month in turn and collects every invoice that
    /// could be downloaded and parsed into <see cref="OrderInvoices"/>.
    /// </summary>
    /// <returns>A task that completes when all months have been processed.</returns>
    private async Task GetOrderInvoicesAsync()
    {
        for (var index = 0; index < _yearMonths.Count; index++)
        {
            string queryInvoicesUrl;
            string refererUrl;
            if (index == 0)
            {
                // The first request uses the month-less landing URL and a fixed referer.
                queryInvoicesUrl = string.Format(QueryInvoicesFirstUrl, _baseUrl);
                refererUrl = "diagnostics://4/";
            }
            else
            {
                // Later requests name the month explicitly and use the previous month's
                // query URL as referer.
                queryInvoicesUrl = string.Format(QueryInvoicesUrl, _baseUrl, _yearMonths[index]);
                refererUrl = string.Format(QueryInvoicesUrl, _baseUrl, _yearMonths[index - 1]);
            }

            var request = _thirdPartyRequest.CreateRequest(queryInvoicesUrl, refererUrl);
            var response = await _client.ExecuteAsync(request);

            // NOTE(review): an empty query response ends the whole run instead of just
            // skipping this month; kept as in the original - confirm this is intended.
            if (response.ContentLength == 0) return;

            var downloadInvoiceInfo = await DownloadInvoiceAsync(response.ResponseUri.ToString());
            if (downloadInvoiceInfo == null) continue;
            OrderInvoices.Add(downloadInvoiceInfo);
        }
    }

    /// <summary>
    /// Downloads the invoice PDF and parses the invoice information out of it.
    /// </summary>
    /// <param name="refererUrl">URL sent as referer of the download request.</param>
    /// <returns>The download result, or <c>null</c> when the response is empty or no
    /// invoice code/number could be parsed from the file.</returns>
    private async Task<DownloadInvoiceInfo> DownloadInvoiceAsync(string refererUrl)
    {
        var request = _thirdPartyRequest.CreateRequest(string.Format(DownloadInvoiceUrl, _baseUrl), refererUrl);
        var response = await _client.ExecuteAsync(request);
        if (response.ContentLength == 0) return null;

        var tempPath = FileHelper.GetTempFileName(string.Format("{0}.pdf", Guid.NewGuid()));
        await FileHelper.Save(response.RawBytes, tempPath);

        var invoiceInfo = await _invoiceBll.GetInvoiceInfoByFile(tempPath);

        // NOTE(review): when parsing fails the temporary file(s) are left behind; consider
        // deleting them here once the FileHelper API for that is confirmed.
        if (invoiceInfo == null) return null;
        if (string.IsNullOrEmpty(invoiceInfo.InvoiceCode) || string.IsNullOrEmpty(invoiceInfo.InvoiceNumber)) return null;

        // The invoice was parsed: move the temporary file to its permanent location.
        var filePath = FileHelper.GetFilePath("ChinaTelecom", string.Format("{0}-{1}.pdf", invoiceInfo.InvoiceCode, invoiceInfo.InvoiceNumber));
        FileHelper.MoveFile(tempPath, filePath);
        // A .jpg with the same base name is moved along with the PDF
        // (presumably produced while parsing the file - confirm against ThirdPartyBll).
        FileHelper.MoveFile(tempPath.Replace(".pdf", ".jpg"), filePath.Replace(".pdf", ".jpg"));

        invoiceInfo.FileUrl = FileHelper.ToVirtualUrl(filePath);
        invoiceInfo.Picture = invoiceInfo.FileUrl.Replace(".pdf", ".jpg");

        return new DownloadInvoiceInfo
        {
            FilePath = filePath,
            InvoiceInfo = invoiceInfo,
            RefererUrl = refererUrl,
            DownloadUrl = request.Resource
        };
    }

    /// <summary>
    /// Extracts "cityCode" from the cookie and resolves the matching base URL, falling
    /// back to the "www" sub-domain when the cookie carries no city code.
    /// </summary>
    /// <returns>The base URL this instance sends its requests to.</returns>
    private string GetCityDomain()
    {
        var cityCode = Common.CookieFilter(_userCookie, string.Format(CookieValuePattern, "cityCode"));
        string resolvedBaseUrl;
        if (string.IsNullOrEmpty(cityCode))
        {
            if (Log.IsWarnEnabled) Log.Warn("ChinaTelecom.GetCityDomain,cityCode is null");
            resolvedBaseUrl = string.Format(BaseUrlTemplate, "www");
        }
        else
        {
            resolvedBaseUrl = string.Format(BaseUrlTemplate, cityCode);
            // Always format from the pristine template, never from the previous result.
            CityHost = string.Format(CityHostTemplate, "/" + cityCode + "/");
        }

        // Mirror the resolved value into the legacy static for existing readers.
        BaseUrl = resolvedBaseUrl;
        return resolvedBaseUrl;
    }

    /// <summary>
    /// Builds the list of months (formatted "yyyyMM", newest first) that are queried:
    /// starting with last month and going back while the date is later than
    /// <c>GlobalConfig.MinValidateInvoiceDate</c>, up to 12 entries.
    /// </summary>
    /// <returns>A new list owned by the calling instance.</returns>
    private static List<string> GetYearMonths()
    {
        var yearMonths = new List<string>(MaxYearMonths);
        var cursor = DateTime.Now.AddMonths(-1);
        while (cursor > GlobalConfig.MinValidateInvoiceDate && yearMonths.Count < MaxYearMonths)
        {
            // Invariant culture keeps the value Gregorian regardless of the thread culture.
            yearMonths.Add(cursor.ToString(YearMonthFormat, System.Globalization.CultureInfo.InvariantCulture));
            cursor = cursor.AddMonths(-1);
        }
        return yearMonths;
    }
}