一张有马赛克的图
批量抓取企信宝页面(前 200 页)
'''
#导入需要的库
import requests
import urllib.request,socket,re,sys,os
import ssl
import fileinput
import time
import random
import xlrd
import xlwt
from openpyxl import load_workbook
from openpyxl import Workbook
from bs4 import BeautifulSoup
ssl._create_default_https_context = ssl._create_unverified_context
# Path where downloaded files would be saved.
# NOTE(review): targetPath is never referenced anywhere in this file — it looks
# like leftover from an earlier douban-image script; confirm before removing.
targetPath = "//Users//wangleilei//Documents//03__douban_Images"
# Save helper: append one line of text to the output file.
def saveFile(data, path="//Users//wangleilei//Documents//公司名称.txt"):
    """Append *data* plus a newline to the output text file.

    Args:
        data: the text to append (a company name).
        path: target file; defaults to the original hard-coded location,
              so existing callers keep working. (Adjust for your machine.)
    """
    # The context manager guarantees the handle is closed even if a write
    # fails; explicit UTF-8 avoids platform-dependent default encodings
    # mangling the Chinese company names.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
        f.write('\n')
# Scrape one search-result page and write the rows into the Excel workbook.
def getData(index1):
    """Scrape one page of qixin.com tourism-company search results and
    append the parsed rows into the local Excel workbook.

    Args:
        index1: page number as a string (the caller passes ``str(i)``).

    Side effects:
        Performs an HTTP GET against qixin.com, then updates and saves
        旅游公司资料.xlsx (10 rows per page, row = page*10 + i + 1).
    """
    url = ("http://www.qixin.com/search?key=%E6%97%85%E6%B8%B8&page="
           + index1 + "&status[]=1")
    print(url)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0',
    'Cookie': '_zg=%7B%22uuid%22%3A%20%221604427941638e-0e3eb0a6fa80588-49566e-13c680-160442794174ec%22%2C%22sid%22%3A%201513046075.538%2C%22updated%22%3A%201513047400.161%2C%22info%22%3A%201512971932703%2C%22cuid%22%3A%20%228449f8dd-5c6a-4768-b489-f34053c20d77%22%7D; showsale=1; cookieShowLoginTip=1; Hm_lvt_52d64b8d3f6d42a2e416d59635df3f71=1512971936,1513041298,1513042473; responseTimeline=269; Hm_lpvt_52d64b8d3f6d42a2e416d59635df3f71=1513047389; channel=baidu; sid=s%3At4yyT47_qFqkQTz6UKIZi5agdUBPtiFC.TEo%2BGM8VoEPsLki31uC0Sr%2FOFIg%2BbobaL6sYbAZbGCM'}
    # Timeout so one stalled page cannot hang the whole crawl loop.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')
    companys = soup.select("div.col-2 > div.col-2-1 > div.company-title > a")
    legalpersons = soup.select("div.col-2 > div.col-2-1 > div.legal-person")
    print(legalpersons)

    # Open and save the workbook ONCE per page. The original re-loaded and
    # re-saved the entire .xlsx file for every one of the 10 rows, rewriting
    # the whole file 10x per page for no benefit.
    xlsx_path = "//Users//wangleilei//Documents//旅游公司资料.xlsx"
    wb = load_workbook(xlsx_path)
    print(wb.sheetnames)        # modern API; get_sheet_names() is deprecated
    ws = wb["Sheet1"]           # modern API; get_sheet_by_name() is deprecated

    for i in range(0, 10):
        company = companys[i]
        # Each listing appears to carry three consecutive "legal-person"
        # divs: person, phone+mail, address — TODO confirm against the
        # live page markup before relying on the i*3 stride.
        p = legalpersons[i * 3]
        t = legalpersons[i * 3 + 1]
        a = legalpersons[i * 3 + 2]
        # Hoist get_text() — the original called it four times per row.
        contact_text = t.get_text()
        tStart = contact_text.find('电话')
        aStart = contact_text.find('邮箱')
        # Insertion order is preserved (Python 3.7+ dicts), so the column
        # order below is stable: name, person, phone, mail, address.
        dataDict = {
            "公司名称": company.get_text(),
            "公司法人": p.get_text()[6:],
            "联系电话": contact_text[tStart + 3:aStart],
            "邮箱": contact_text[aStart + 3:],
            "地址": a.get_text()[3:],
        }
        # One Excel row per company: pages are 10 rows apart.
        tempRow = int(index1) * 10 + i + 1
        for tempColumn, key in enumerate(dataDict, start=1):
            d = ws.cell(row=tempRow, column=tempColumn)
            d.value = dataDict[key]
            print(d.value)
        print(dataDict)
    wb.save(xlsx_path)
def read_excel(path=r'//Users//wangleilei//Documents//旅游公司资料.xlsx'):
    """Print basic diagnostics about the workbook: sheet names, the first
    sheet's dimensions, and the values of row 1.

    Args:
        path: workbook location; defaults to the original hard-coded file,
              so existing callers keep working.

    NOTE(review): xlrd 2.0+ dropped .xlsx support — this requires
    xlrd < 2.0, or should be migrated to openpyxl like the rest of the file.
    """
    workbook = xlrd.open_workbook(path)
    print(workbook.sheet_names())  # e.g. ['Sheet1', 'Sheet2', 'Sheet3']
    # Look up the sheet by name (the original also computed — and never
    # used — sheet_names()[0]; that dead local is removed).
    sheet1 = workbook.sheet_by_name('Sheet1')
    print(sheet1.name, sheet1.nrows, sheet1.ncols)
    rows = sheet1.row_values(1)
    print(rows)
def writeData(path="//Users//wangleilei//Documents//旅游公司资料.xlsx"):
    """Demo of openpyxl writing: print the A1:C4 range, set C3 to "qq",
    and save the workbook back in place.

    Args:
        path: workbook location; defaults to the original hard-coded file,
              so existing callers keep working.
    """
    wb = load_workbook(path)
    print(wb.sheetnames)   # modern API; get_sheet_names() is deprecated
    ws = wb["Sheet1"]      # modern API; get_sheet_by_name() is deprecated
    # Walk the fixed A1:C4 range, three cells per row.
    for a, b, c in ws["A1":"C4"]:
        print(a.value, b.value, c.value)
    d = ws.cell(row=3, column=3)
    d.value = "qq"
    print(d.value)
    wb.save(path)
# Driver: crawl pages 18..299, sleeping a random 0-19 s between requests
# to avoid hammering the site. The manual while/counter of the original is
# replaced by the idiomatic range loop; behavior is identical.
for page in range(18, 300):
    print(page)
    getData(str(page))
    delay = random.randrange(0, 20)
    time.sleep(delay)
    print("随机数")
# Blog-paste residue (would raise NameError at runtime if left bare):
# 复制代码
# 我的Python3爬虫系列