import os
import pandas as pd
import requests
import scrapy
def get(company):
    """Query the search site for a company name and return its address, or None on failure."""
    search_url = 'XXX/key='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'
    }
    try:
        response = requests.get(search_url + company, headers=headers)
        selector = scrapy.Selector(text=response.text)
        # Extract the address from the result page (the XPath is left as a placeholder)
        address = selector.xpath('xxx').extract()[0]
        print(address)
        return address
    except Exception as e:
        # Report the failure instead of silently swallowing it, and return None
        print(f'Lookup failed for {company}: {e}')
        return None
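# Minimal usage sketch (hypothetical; assumes the placeholder search_url and
# XPath above have been replaced with real values):
#     address = get('Example Co., Ltd.')
#     if address is None:
#         print('lookup failed')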
# Define the input folder and the output file paths
input_folder = os.path.join(os.path.expanduser('~'), 'Desktop', 'xxx')
output_file = os.path.join(os.path.expanduser('~'), 'Desktop', 'xxx.xlsx')
futian_file = os.path.join(os.path.expanduser('~'), 'Desktop', 'xxx.xlsx')
# Initialize the list of collected results
data = []
# Iterate over every Excel file in the input folder, read its company-name column,
# look up each company's address, and collect the results
for filename in os.listdir(input_folder):
    if not filename.endswith('.xlsx'):
        continue
    filepath = os.path.join(input_folder, filename)
    excel_data = pd.read_excel(filepath, usecols=['Company'], header=0, sheet_name='Sheet1')
    companies = excel_data['Company'].tolist()
    print(filename)
    for company in companies:
        # Skip companies that have already been queried
        if any(d['企业名称'] == company for d in data):
            continue
        # Query the address for this company
        address = get(company)
        # Append the company name and address to the result list
        data.append({'企业名称': company, '地址': address})
# Convert the result list to a DataFrame
df = pd.DataFrame(data)
# Save the DataFrame to the output file
df.to_excel(output_file, index=False)
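# Design note (hypothetical alternative): the duplicate check above scans the whole
# result list for every company, which is O(n) per lookup. Tracking already-queried
# names in a set would make the check O(1), e.g.:
#     seen = set()
#     if company in seen:
#         continue
#     seen.add(company)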