'''
Python3爬虫-06-爬取企业信息导出Excel表格

一张有马塞克的图


批量抓取企信宝页面(前200页吧)
'''
#导入需要的库
import requests
import urllib.request,socket,re,sys,os
import ssl
import fileinput
import time
import random
import xlrd
import xlwt
from openpyxl import load_workbook
from openpyxl import Workbook
from bs4 import BeautifulSoup
# Replace the ssl module's default HTTPS context factory with the unverified
# one, which turns off certificate verification for stdlib HTTPS clients.
# NOTE(review): this is a process-wide security downgrade - confirm it is
# still needed.
ssl._create_default_https_context = ssl._create_unverified_context
# Directory intended for saved files; not referenced by the code visible in
# this file.
targetPath = "//Users//wangleilei//Documents//03__douban_Images"



    # 定义保存函数
def saveFile(data, path="//Users//wangleilei//Documents//公司名称.txt"):
    """Append one line of text to the company-name file.

    Args:
        data: Text to append; a trailing newline is added automatically.
        path: Destination file. Defaults to the author's local path -
            replace it with your own.
    """
    # The context manager closes the handle even if a write fails, and the
    # explicit UTF-8 encoding keeps Chinese text writable regardless of the
    # platform's default encoding.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
        f.write('\n')
# 网址

def _split_contact(text):
    """Split a contact line into ``(phone, email)``.

    The text is expected to contain a '电话' label followed by the phone
    number and a '邮箱' label followed by the e-mail address. Each label is
    skipped together with one following character (presumably a colon -
    verify against the live page). A missing label yields an empty string
    for that field instead of a mis-sliced value.
    """
    phone_at = text.find('电话')
    email_at = text.find('邮箱')

    email = text[email_at + 3:] if email_at != -1 else ""
    # Without an e-mail label the phone number runs to the end of the text.
    phone_end = email_at if email_at != -1 else len(text)
    phone = text[phone_at + 3:phone_end] if phone_at != -1 else ""
    return phone, email


def getData(index1):
    """Scrape one qixin.com search-result page into the Excel workbook.

    Downloads result page ``index1`` of the search, extracts up to ten
    company records (name, legal person, phone, e-mail, address) and writes
    them to sheet "Sheet1" of the workbook, one record per row. The rows
    used for a page start at ``int(index1) * 10 + 1``.

    Args:
        index1: Page number, as a string (an int is accepted as well).

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
        KeyError: If the workbook has no sheet named "Sheet1".
    """
    page_size = 10  # rows reserved per result page in the sheet
    workbook_path = "//Users//wangleilei//Documents//旅游公司资料.xlsx"
    page = str(index1)

    url = "http://www.qixin.com/search?key=%E6%97%85%E6%B8%B8&page=" + page + "&status[]=1"

    print(url)

    # NOTE(review): the Cookie below is a captured browser session and will
    # presumably need refreshing when it expires.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0',
               'Cookie': '_zg=%7B%22uuid%22%3A%20%221604427941638e-0e3eb0a6fa80588-49566e-13c680-160442794174ec%22%2C%22sid%22%3A%201513046075.538%2C%22updated%22%3A%201513047400.161%2C%22info%22%3A%201512971932703%2C%22cuid%22%3A%20%228449f8dd-5c6a-4768-b489-f34053c20d77%22%7D; showsale=1; cookieShowLoginTip=1; Hm_lvt_52d64b8d3f6d42a2e416d59635df3f71=1512971936,1513041298,1513042473; responseTimeline=269; Hm_lpvt_52d64b8d3f6d42a2e416d59635df3f71=1513047389; channel=baidu; sid=s%3At4yyT47_qFqkQTz6UKIZi5agdUBPtiFC.TEo%2BGM8VoEPsLki31uC0Sr%2FOFIg%2BbobaL6sYbAZbGCM'}

    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    companys = soup.select("div.col-2 > div.col-2-1 > div.company-title > a")
    # Each company contributes three consecutive "legal-person" divs:
    # legal person, contact line (phone + e-mail), address.
    legalpersons = soup.select("div.col-2 > div.col-2-1 > div.legal-person")

    # Only process records that are actually present, so a short or blocked
    # page no longer raises IndexError.
    record_count = min(page_size, len(companys), len(legalpersons) // 3)
    if record_count == 0:
        print("no company records parsed for page", page)
        return

    # Open the workbook once per page instead of once per record.
    wb = load_workbook(workbook_path)
    print(wb.sheetnames)
    ws = wb["Sheet1"]

    first_row = int(page) * page_size + 1
    for offset in range(record_count):
        person_text = legalpersons[offset * 3].get_text()
        contact_text = legalpersons[offset * 3 + 1].get_text()
        address_text = legalpersons[offset * 3 + 2].get_text()
        phone, email = _split_contact(contact_text)

        dataDict = {
            "公司名称": companys[offset].get_text(),
            # The fixed offsets skip the field labels at the start of each
            # text (label widths taken from the original code - TODO confirm).
            "公司法人": person_text[6:],
            "联系电话": phone,
            "邮箱": email,
            "地址": address_text[3:]
        }

        # Columns follow the dict's insertion order: name, legal person,
        # phone, e-mail, address.
        for column, value in enumerate(dataDict.values(), start=1):
            ws.cell(row=first_row + offset, column=column, value=value)

        print(dataDict)

    wb.save(workbook_path)






def read_excel():
    """Print summary information about the travel-company workbook.

    Opens the workbook with xlrd, prints the list of sheet names, then the
    name, row count and column count of "Sheet1", and finally the values of
    the row at index 1.
    """
    # NOTE(review): xlrd 2.0+ reads only .xls files - confirm the installed
    # version can still open this .xlsx workbook.
    book = xlrd.open_workbook(r'//Users//wangleilei//Documents//旅游公司资料.xlsx')
    print(book.sheet_names())
    first_sheet_name = book.sheet_names()[0]  # looked up but not used below

    # The sheet is fetched by its literal name rather than by index.
    sheet = book.sheet_by_name('Sheet1')
    print(sheet.name, sheet.nrows, sheet.ncols)

    # Index 1 is the second row of the sheet (xlrd rows are 0-based).
    second_row = sheet.row_values(1)
    print(second_row)


def writeData():
    """Demo: read a few cells from the workbook, overwrite C3, and save.

    Prints the workbook's sheet names and the values of cells A1:C4 of
    "Sheet1", sets cell C3 to "qq", prints it, and saves the workbook in
    place.

    Raises:
        KeyError: If the workbook has no sheet named "Sheet1".
    """
    workbook_path = "//Users//wangleilei//Documents//旅游公司资料.xlsx"
    wb = load_workbook(workbook_path)
    # `sheetnames` and `wb[...]` replace the deprecated get_sheet_names() /
    # get_sheet_by_name() accessors.
    print (wb.sheetnames)
    # Fetch the worksheet named "Sheet1".
    ws = wb["Sheet1"]
    # Walk the 4x3 range row by row; each row unpacks into three cells.
    for a, b, c in ws["A1":"C4"]:
        print (a.value, b.value, c.value)

    d = ws.cell(row=3, column=3)
    d.value = "qq"
    print (d.value)
    wb.save(workbook_path)

# writeData()
# read_excel()
# getData(str(2))


# Crawl result pages 18 through 299, pausing a random 0-19 seconds after each
# page.
for page_number in range(18, 300):
    print (page_number)
    getData(str(page_number))
    pause_seconds = random.randrange(0, 20)
    time.sleep(pause_seconds)
    print("随机数")


复制代码

我的Python3爬虫系列

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值