import requests
import pandas
from lxml import etree
class Spider(object):
    """Scrape a table from one Baidu Baike entry and export it to Excel.

    The target URL is the Baike entry whose percent-encoded title decodes to
    "青春有你第二季". ``run()`` downloads the page, extracts one row of eight
    fields per table row, and writes the rows to ``baike.xlsx``.
    """

    # Seconds to wait for the server before giving up on the request.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the page URL and the browser-like request headers."""
        self.url = 'https://baike.baidu.com/item/%E9%9D%92%E6%98%A5%E6%9C%89%E4%BD%A0%E7%AC%AC%E4%BA%8C%E5%AD%A3'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }

    @staticmethod
    def _first(values, default=''):
        """Return the first item of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def get_data(self):
        """Download the page and extract one record per table row.

        Returns:
            A list of rows; each row is the list
            ``[name, detail_link, position, star, height, weight, content,
            company]``. Cells with no text are returned as ``''``.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        # A timeout keeps a stalled connection from hanging the scraper.
        response = requests.get(
            self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        # Fail loudly rather than silently parsing an error page.
        response.raise_for_status()
        page = etree.HTML(response.content.decode())
        node_list = page.xpath('//div[@class="main-content"]//table[@class="table-view log-set-param"]//tr')
        data_list = []
        # NOTE(review): the [21:-2] window is tied to the page layout at the
        # time of writing (presumably it skips rows of earlier tables and
        # trailing rows) -- verify against the live page.
        for node in node_list[21:-2]:
            # Name: normally the link text; fall back to the plain cell text
            # when the name is not linked.
            name = (self._first(node.xpath('./td[1]/a/text()'))
                    or self._first(node.xpath('./td[1]/text()')))
            if not name.strip():
                # No name at all (e.g. a header or spacer row): nothing to
                # record, and previously this would have raised IndexError.
                continue
            # Detail page: absolute URL, or '' when the name has no link.
            href = self._first(node.xpath('./td[1]/a/@href'))
            detail_link = ('https://baike.baidu.com' + href) if href else ''
            # Region
            position = self._first(node.xpath('./td[2]/text()'))
            # Zodiac sign
            star = self._first(node.xpath('./td[3]/text()'))
            # Height
            height = self._first(node.xpath('./td[4]/text()'))
            # Weight
            weight = self._first(node.xpath('./td[5]/text()'))
            # "Flower language" text
            content = self._first(node.xpath('./td[6]/text()'))
            # Agency: prefer the link text, otherwise the plain cell text.
            company = (self._first(node.xpath('./td[7]//a/text()'))
                       or self._first(node.xpath('./td[7]/text()')))
            data = [name, detail_link, position, star, height, weight, content, company]
            print(data)
            data_list.append(data)
        return data_list

    def save_excel(self, data_list):
        """Write *data_list* to ``baike.xlsx`` in the working directory.

        Args:
            data_list: rows of eight values each, in the column order used
                by ``get_data``.
        """
        # NOTE(review): the last header '经济公司' looks like a typo for
        # '经纪公司' (agency); it is left unchanged because it is written
        # into the output file.
        df = pandas.DataFrame(data_list, columns=['姓名', '详情页', '地区', '星座', '身高', '体重', '花语', '经济公司'])
        # index=False is the documented way to omit the row-index column.
        df.to_excel('baike.xlsx', index=False)

    def run(self):
        """Scrape the page and save the result to the spreadsheet."""
        data_list = self.get_data()
        self.save_excel(data_list)
if __name__ == '__main__':
    # Script entry point: build the spider and run the full
    # scrape-then-export pipeline.
    spider = Spider()
    spider.run()
pandas 写入excel
最新推荐文章于 2024-08-26 22:31:41 发布