1. Scrape the China Fortune 500 ranking data: company name, address, 2020 rank, 2019 rank, revenue, year-on-year revenue change, profit, year-on-year profit change, assets, market value, shareholders' equity
import os
import re
import time
import requests
import pandas as pd
from lxml import etree
def get_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3867.400 QQBrowser/10.7.4315.400"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # raise for non-2xx status codes
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ''
def get_parse(html):
    dom = etree.HTML(html)
    # Relative links to each company's detail page
    result = dom.xpath('//tbody/tr/td[3]/a/@href')
    result1 = [i[12:] for i in result]
    data = []
    for i in result1:
        url = 'http://www.fortunechina.com/' + i
        htm = get_page(url)
        dom1 = etree.HTML(htm)
        rank2019 = ''
        # Company name
        com_name = dom1.xpath('//div[@class="comp-name"]/text()')[0]
        # Address
        add = dom1.xpath('//div[@class="info"]/p[1]/text()')[0]
        # 2020 rank
        rank2020 = dom1.xpath('//div[@class="con"]/em/text()')[0]
        # 2019 rank: serialize the <em class="r2"> element and pull the number out with a regex
        r1 = dom1.xpath('//span/em[@class="r2"]')[0]
        r2 = etree.tostring(r1)
        r3 = r2.decode('utf-8')
        r4 = re.findall('.*?>([0-9]+)<', r3)
        if len(r4) == 0:
            rank2019 = ''
        else:
            rank2019 = r4[0]
        # Revenue
        money = dom1.xpath('//div[@class="table"]//tr[2]/td[2]/text()')[0]
        # Year-on-year revenue change
        money_year = dom1.xpath('//div[@class="table"]//tr[2]/td[3]/text()')[0]
        # Profit
        profit = dom1.xpath('//div[@class="table"]//tr[3]/td[2]/text()')[0]
        # Year-on-year profit change
        profit_year = dom1.xpath('//div[@class="table"]//tr[3]/td[3]/text()')[0]
        # Assets
        zc = dom1.xpath('//div[@class="table"]//tr[4]/td[2]/text()')[0]
        # Market value
        sz = dom1.xpath('//div[@class="table"]//tr[5]/td[2]/text()')[0]
        # Shareholders' equity
        qy = dom1.xpath('//div[@class="table"]//tr[6]/td[2]/text()')[0]
        item = {
            '企业名称': com_name,
            '地址': add,
            '2020年排名': rank2020,
            '2019年排名': rank2019,
            '营业收入': money,
            '营业收入年增减': money_year,
            '利润': profit,
            '利润年增减': profit_year,
            '资产': zc,
            '市值': sz,
            '股东权益': qy
        }
        data.append(item)  # collect rows directly; list.append returns None
    return data
def save_page(data):
    # Append to the csv if it already exists, otherwise create it with a header row
    if os.path.exists("中国500强企业.csv"):
        data.to_csv("中国500强企业.csv", mode='a', encoding='utf-8-sig', index=False, header=False)
    else:
        data.to_csv("中国500强企业.csv", encoding='utf-8-sig', index=False)
    print("保存成功!")
url='http://www.fortunechina.com/fortune500/c/2020-07/27/content_369925.htm'
html=get_page(url)
d=get_parse(html)
data=pd.DataFrame(d)
save_page(data)
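Because get_parse requests one detail page per company, giving requests a timeout and pausing briefly between requests makes the crawl much less likely to hang or get blocked. The helper below is only a minimal sketch of that idea (it reuses the already-imported time module); get_page_safe and its delay parameter are illustrative names, not part of the original script.

def get_page_safe(url, delay=1):
    # Sketch: same as get_page, but fail fast on slow responses and pause between requests
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        time.sleep(delay)  # be polite to the server between detail pages
        return r.text
    except Exception:
        return ''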
2. Scrape the click-ranking information from the Zongheng novel site, including rank, novel title, author, genre, synopsis, and click count.
URL: http://www.zongheng.com/rank/details.html?rt=5&d=1  Technical route: requests + XPath
import os
import re
import time
import requests
import pandas as pd
from lxml import etree
def get_page(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3867.400 QQBrowser/10.7.4315.400"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()  # raise for non-2xx status codes
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ''
def get_parse(html):
    dom = etree.HTML(html)
    # Links to the detail page of each entry in the ranking list
    result = dom.xpath('//div[@class="rank_d_b_name"]/a/@href')
    result1 = [i[12:] for i in result]
    data = []
    for i in result1:
        url = 'http://www.fortunechina.com/' + i
        htm = get_page(url)
        dom1 = etree.HTML(htm)
        rank2019 = ''
        # Company name
        com_name = dom1.xpath('//div[@class="comp-name"]/text()')[0]
        # Address
        add = dom1.xpath('//div[@class="info"]/p[1]/text()')[0]
        # 2020 rank
        rank2020 = dom1.xpath('//div[@class="con"]/em/text()')[0]
        # 2019 rank: serialize the <em class="r2"> element and pull the number out with a regex
        r1 = dom1.xpath('//span/em[@class="r2"]')[0]
        r2 = etree.tostring(r1)
        r3 = r2.decode('utf-8')
        r4 = re.findall('.*?>([0-9]+)<', r3)
        if len(r4) == 0:
            rank2019 = ''
        else:
            rank2019 = r4[0]
        # Revenue
        money = dom1.xpath('//div[@class="table"]//tr[2]/td[2]/text()')[0]
        # Year-on-year revenue change
        money_year = dom1.xpath('//div[@class="table"]//tr[2]/td[3]/text()')[0]
        # Profit
        profit = dom1.xpath('//div[@class="table"]//tr[3]/td[2]/text()')[0]
        # Year-on-year profit change
        profit_year = dom1.xpath('//div[@class="table"]//tr[3]/td[3]/text()')[0]
        # Assets
        zc = dom1.xpath('//div[@class="table"]//tr[4]/td[2]/text()')[0]
        # Market value
        sz = dom1.xpath('//div[@class="table"]//tr[5]/td[2]/text()')[0]
        # Shareholders' equity
        qy = dom1.xpath('//div[@class="table"]//tr[6]/td[2]/text()')[0]
        item = {
            '企业名称': com_name,
            '地址': add,
            '2020年排名': rank2020,
            '2019年排名': rank2019,
            '营业收入': money,
            '营业收入年增减': money_year,
            '利润': profit,
            '利润年增减': profit_year,
            '资产': zc,
            '市值': sz,
            '股东权益': qy
        }
        data.append(item)  # collect rows directly; list.append returns None
    return data
def save_page(data):
    # Append to the csv if it already exists, otherwise create it with a header row
    if os.path.exists("纵横小说网信息.csv"):
        data.to_csv("纵横小说网信息.csv", mode='a', encoding='utf-8-sig', index=False, header=False)
    else:
        data.to_csv("纵横小说网信息.csv", encoding='utf-8-sig', index=False)
    print("保存成功!")
url='http://www.zongheng.com/rank/details.html?rt=5&d=1'
html=get_page(url)
d=get_parse(html)
data=pd.DataFrame(d)
save_page(data)
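The get_parse body above still extracts the Fortune-China company fields; for the Zongheng click ranking, the required fields (rank, title, author, genre, synopsis, clicks) can usually be read straight off the ranking list page without visiting each book's detail page. The function below is only a rough sketch of that approach: apart from rank_d_b_name, which already appears above, the class names (rank_d_list, rank_d_b_cate, rank_d_b_info, rank_d_b_ticket) are assumptions about the page markup and must be checked against the actual HTML.

def parse_rank_page(html):
    # Sketch only: selectors other than rank_d_b_name are assumed and need verification
    dom = etree.HTML(html)
    items = []
    for rank, node in enumerate(dom.xpath('//div[@class="rank_d_list"]'), start=1):
        name = ''.join(node.xpath('.//div[@class="rank_d_b_name"]/a/text()')).strip()
        cate = ''.join(node.xpath('.//div[@class="rank_d_b_cate"]//text()')).split()
        author = cate[0] if cate else ''
        genre = cate[1] if len(cate) > 1 else ''
        intro = ''.join(node.xpath('.//div[@class="rank_d_b_info"]//text()')).strip()
        clicks = ''.join(node.xpath('.//div[@class="rank_d_b_ticket"]//text()')).strip()
        items.append({'排名': rank, '小说名': name, '作者': author,
                      '类型': genre, '简介': intro, '点击数': clicks})
    return items

The resulting list of dicts can be passed to pd.DataFrame and saved with save_page in the same way as above.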
3. Open the social-recruitment section of the Tencent careers site, search for a category of job postings, and save the results to a file. Each posting should include: job title, department, work location, job category, publish date, and job description. Multiple result pages must be crawled.
● URL: https://careers.tencent.com/search.html
● Technical route: selenium + Chrome
import os
import time
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def search(url, keyword):
    driver.get(url)
    driver.maximize_window()
    search_box = wait.until(EC.presence_of_element_located((By.ID, 'searchVal')))
    search_box.send_keys(keyword)
    search_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@type="button"]')))
    search_btn.click()
    time.sleep(5)
    totalpage_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//ul[@class="ivu-page"]/li[last()-1]')))
    totalpage = totalpage_btn.get_attribute('title')
    return int(totalpage)
def get_page(page):
    if page > 1:
        # Locate the next-page button; an explicit wait is safer than a fixed delay
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.ivu-page-next')))
        # Click it to move on to the next results page
        next_button.click()
        time.sleep(5)
    return driver.page_source
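# A possible refinement (a sketch, not part of the original script): instead of the fixed
# time.sleep(5) above, grab a reference to the current job list before clicking "next" and
# wait with EC.staleness_of until that node is replaced. The '.recruit-list' selector is an
# assumption about the rendered markup and must be verified in the browser's dev tools.
def get_page_waited(page):
    if page > 1:
        old_list = driver.find_element(By.CSS_SELECTOR, '.recruit-list')  # assumed selector
        next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.ivu-page-next')))
        next_button.click()
        wait.until(EC.staleness_of(old_list))  # returns once the old node has been detached
    return driver.page_source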
# Parse the page source with XPath
def parse_page(html):
    dom = etree.HTML(html)
    # Book names
    book_name = dom.xpath('//div[@class="book_item"]/a/p/text()')
    #print(book_name)
    # Link addresses (the results are relative URLs)
    url_list = dom.xpath('//div[@class="book_item"]/a/@href')
    #print(url_list)
    # Prepend the host to turn them into absolute URLs
    host = 'https://www.ptpress.com.cn'
    book_url = [host+i for i in url_list]
    #print(book_url)
    data = pd.DataFrame({
        'name': book_name,
        'url': book_url
    })
    return data
# Append each page's data to the csv file
def save_file(filename, data):  # data is a DataFrame
    if os.path.exists(filename):
        data.to_csv(filename, mode='a', encoding='utf_8_sig', index=False, header=False)
    else:
        data.to_csv(filename, mode='a', encoding='utf_8_sig', index=False)
if __name__ == '__main__':
    filename = '腾讯招聘信息.csv'
    keyword = input("请输入要搜索的岗位类别:")
    driver = webdriver.Chrome()
    url = "https://careers.tencent.com/search.html"
    # wait is a global explicit wait with a 10-second timeout
    wait = WebDriverWait(driver, 10)
    # Open the search results page and get the total number of pages
    total_page = search(url, keyword)
    print(total_page)
    for i in range(1, total_page+1):
        html = get_page(i)
        data = parse_page(html)
        save_file(filename, data)
        print("第{}页爬取完成!".format(i))
    driver.close()
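The parse_page above still parses a book list from a different exercise; for the Tencent postings, the six required fields (job title, department, location, category, publish date, description) come from the rendered search-result cards. The sketch below shows one way that parse could look; all of its class names (recruit-list, recruit-title, recruit-tips, recruit-text) are assumptions about the rendered markup and have to be confirmed in the browser before use.

def parse_jobs(html):
    # Sketch only: every selector here is an assumption to verify against the live page
    dom = etree.HTML(html)
    rows = []
    for node in dom.xpath('//div[@class="recruit-list"]'):
        title = ''.join(node.xpath('.//h4[@class="recruit-title"]/text()')).strip()
        # The tip line is assumed to hold department / location / category / date as separate spans
        tips = [t.strip() for t in node.xpath('.//p[contains(@class,"recruit-tips")]/span/text()')]
        tips += [''] * (4 - len(tips))  # pad so missing fields stay empty
        desc = ''.join(node.xpath('.//p[contains(@class,"recruit-text")]/text()')).strip()
        rows.append({'职位名称': title, '部门': tips[0], '工作地点': tips[1],
                     '职位类别': tips[2], '发布时间': tips[3], '职位简介': desc})
    return pd.DataFrame(rows)

The returned DataFrame can be passed straight to save_file in place of the parse_page result.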
4. Scrape multiple pages of user reviews of Beijing Happy Valley from Ctrip in one run: username, rating, review text, and review date.
def webpage():
    import requests
    import json
    import pandas
    begin = int(input("请输入爬取起始页:"))
    end = int(input("请输入爬取终止页:"))
    uname = []
    score = []
    comments = []
    commentdate = []
    for i in range(begin, end+1):
        url="https://sec-m.ctrip.com/restapi/soa2/12530/json/viewCommentList?_fxpcqlniredt=