Scraper: 金采网 (cfcpn.com) Data (2018-11-19)

Target URL

http://www.cfcpn.com/plist/caigou?pageNo=1&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo=,,
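
The listing endpoint is paginated through the pageNo query parameter, and the path segment selects the list: caigou for procurement announcements, jieguo for result announcements. A quick connectivity check before running the full script might look like the sketch below (parameter values copied from the URL above; this is only an illustration, not part of the scraper):

import requests

# Request page 1 of the procurement-announcement list; the scraper only varies
# pageNo (and the list type in the path) between requests.
resp = requests.get(
    'http://www.cfcpn.com/plist/caigou',
    params={'pageNo': 1, 'kflag': 0, 'keyword': '', 'keywordType': '',
            'province': '', 'city': '', 'typeOne': '', 'ptpTwo': ''},
)
print(resp.status_code)  # expect 200 when the listing page is reachable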

Environment

  1. Python 3.6.5 (third-party packages listed below)
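
The script relies on the requests, lxml, xlrd, xlwt and xlutils packages (the post is from 2018, so versions current at that time are assumed); a minimal install would be:

pip install requests lxml xlrd xlwt xlutils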

Scraper Code

# -*- coding: utf-8 -*-
import os
import re
import time

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy


def get_page(url):
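    # Download one listing page and return its HTML decoded as UTF-8;
    # a connection error (or a non-200 status) falls through to None.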
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # response.encoding = 'utf-8'
            html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'')
            # print(sys.getdefaultencoding())
            # print(html)
            return html
    except requests.ConnectionError:
        return None


def parse_page(html):
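    # Scan the listing HTML for hrefs that end in a numeric id -- these are the
    # detail-page links -- and rebuild each relative href as an absolute
    # http://www.cfcpn.com URL.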
    pattern1 = r'<.*?(href=".*?/\d+").*?'
    href_url = re.findall(pattern1, html, re.I)
    # print(href_url)
    url_list = []
    for url in href_url:
        url1 = url.replace('href=', 'http://www.cfcpn.com').replace('"', '')
        # print(url1)
        url_list.append(url1)
    return url_list


def get_detail_page(url):
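    # Download one announcement page and return it parsed as an lxml element
    # tree; a connection error (or a non-200 status) falls through to None.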
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html = response.content.decode('utf-8', 'ignore').replace(u'\xa9', u'').replace(u'\xa0', u'')
            # print(response.status_code)
            # print('--------------------------------')
            # Parse the announcement page into an lxml element tree
            mytree = etree.HTML(html)
            return mytree
    except requests.ConnectionError:
        return None


def parse_detail_page(detail_html):
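    # Pull fields out of one announcement with simple keyword heuristics: the
    # headline is split at '目'/'案' into project name and announcement type,
    # the publish date is sliced out of the second header line, and each body
    # paragraph is scanned for the amount ('元'/'金额'/'价格'), the winning
    # company ('中标'/'供应商'/'中选单位'/'公司') and the purchase method
    # ('采购方式').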
    item_name = ""
    purchase = ""
    item_type = ""
    money = ""
    date = ""
    company = ""
    p_list = detail_html.xpath('//*[@id="news_content"]/p')
    item_name_list = detail_html.xpath('//*[@id="news_head"]/p[1]//text()')
    for item_name_src in item_name_list:
        index = item_name_src.find('目')
        if index == -1:
            index = item_name_src.find('案')
        item_type = item_name_src[index + 1:]
        if index < 1:
            item_name = item_name_src
        else:
            item_name = item_name_src[:index + 1]
    date_list = detail_html.xpath('//*[@id="news_head"]/p[2]//text()')

    for dt in date_list:
        date = dt[5:16]

    for p in p_list:
        p_content = p.xpath('.//text()')
        context = ""
        for text in p_content:
            context = context + text.strip().replace(u'\xa0', u'').replace(u'\xa5', u'').replace('\r\n', '')
        # print(context.strip("\n"))
        if '元' in context or '金额' in context:
            money = context + money
        elif '价格' in context:
            money = context + money
            # print(money)
        if '中标' in context:
            company = company + context
        elif '供应商' in context:
            company = company + context
        elif '中选单位' in context:
            company = company + context
        elif '公司' in context:
            company = company + context

        if '采购方式' in context:
            try:
                # The value follows a colon (usually the full-width ':')
                purchase = re.split('[::]', context, maxsplit=1)[1]
            except IndexError:
                purchase = context

    info_list = [item_name, item_type, purchase, company, money, date]
    return info_list


def write_data(sheet, row, lst):
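    # Write each record in lst as one row of the sheet, starting at the given
    # row index.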
    for data_infos in lst:
        j = 0
        for data in data_infos:
            sheet.write(row, j, data)
            j += 1
        row += 1


def save(file_name, data):
    # Append to the .xls file if it already exists, otherwise create it with a
    # header row.
    if os.path.exists(file_name):
        # Open the existing workbook
        rb = xlrd.open_workbook(file_name, formatting_info=True)
        # Use xlrd to get the number of rows already present
        rn = rb.sheets()[0].nrows
        # Copy the workbook so it can be written to
        wb = copy(rb)
        # Take the first sheet of the copy
        sheet = wb.get_sheet(0)
        # Append the new rows below the existing ones
        write_data(sheet, rn, data)
        # Remove the old file
        os.remove(file_name)
        # Save the copy under the original name
        wb.save(file_name)
    else:
        # Column order matches the list built in parse_detail_page, plus the
        # announcement URL appended in main()
        header = ['item_name', 'item_type', 'purchase', 'company', 'money', 'date', 'url']
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('金采网')
        # Write the header row
        for h in range(len(header)):
            sheet.write(0, h, header[h])
        # Write the data rows below the header
        write_data(sheet, 1, data)
        book.save(file_name)


def main():
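    # Interactive entry point: ask for the announcement type, the save path and
    # the page range, then crawl each listing page and append the parsed rows
    # to an .xls file.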
    print('*' * 80)
    print('\t\t\t\t金采网数据下载')
    print('作者:谢华东  2018.11.8')
    print('--------------')
    table = int(float(input('请输入公告类型(1-采购公告,2-结果公告):\n')))
    while table < 1 or table > 2:
        table = int(float(input('公告类型输入错误,请重新输入:\n')))
    path = (input('请输入要保存的地址(例如:C:\\Users\\xhdong1\\Desktop\\),不输入直接按回车表示默认当前位置:\n'))
    dict_tables = {1: 'caigou', 2: 'jieguo'}
    file_name = path + '金采网' + dict_tables[table] + '.xls'

    # Ask which range of listing pages to crawl

    minfrom = int(input('请输入你需要从哪一个开始爬:\n'))
    maxto = int(input('请输入你需要截止到哪一页:\n'))
    for i in range(minfrom, maxto + 1):
        print('正在爬取' + dict_tables[table] + '下的第' + str(i) + '页数据')
        base = 'http://www.cfcpn.com/plist/{type}?pageNo={page_num}&kflag=0&keyword=&keywordType=&province=&city=&typeOne=&ptpTwo='
        url = base.format(type=dict_tables[table], page_num=i)
        time.sleep(1)
        all_info_list = []
        html = get_page(url)
        if html is None:
            print('该页没有数据')
            continue
        url_list = parse_page(html)
        # print(url_list)
        for url in url_list:
            # print(url)
            detail_html = get_detail_page(url)
            if detail_html is None:
                continue
            context_list = parse_detail_page(detail_html)
            context_list.append(url)
            # print(i)
            # print(context_list)
            all_info_list.append(context_list)
        save(file_name, all_info_list)


if __name__ == '__main__':
    main()
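
Running the script is interactive: it asks for the announcement type (1 = 采购公告 / procurement announcements, 2 = 结果公告 / result announcements), an output directory, and the page range to crawl, then writes one row per announcement to 金采网caigou.xls or 金采网jieguo.xls in that directory, appending if the file already exists. Assuming the code above is saved as cfcpn_spider.py (the file name is only an example), a run looks like:

python cfcpn_spider.py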

Acknowledgments

Thanks to myself.
