import logging.handlers
import time
import traceback

from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Module-level logger for this script.
mylog = logging.getLogger(__name__)
# Logger itself accepts everything from DEBUG up...
mylog.setLevel(logging.DEBUG)
# ...but the console handler only lets INFO and above through.
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
mylog.addHandler(console)
# Collected detail-page URLs, one per company found in the search results.
urls_list = []
def selenium_deal(url, company_name, phone_number='15871157861', password_text='woshixxc11111'):
    """Log in to the site through the shared Selenium ``driver`` and search.

    Fills the phone/password login form on *url*, submits it, then types
    *company_name* into the home search box and clicks the search button.

    :param url: login page URL (e.g. https://www.tianyancha.com/login)
    :param company_name: keyword typed into the search box
    :param phone_number: login account phone; default kept for backward
        compatibility — avoid hard-coding real credentials in source
    :param password_text: login password (same caveat as above)
    :return: the shared ``driver``, now on the search-result page
    """
    driver.get(url)
    driver.implicitly_wait(10)  # implicit wait: up to 10 s per element lookup
    # The find_element_by_* helpers were removed in Selenium 4; use the
    # By-based locator API instead.  The login form shares one container div.
    login_box = '//div[@class="modulein modulein1 mobile_box pl30 pr30 f14 collapse in"]'
    phone = driver.find_element(By.XPATH, login_box + '/div[2]/input')
    password = driver.find_element(By.XPATH, login_box + '/div[3]/input')
    login_button = driver.find_element(By.XPATH, login_box + '/div[5]')
    phone.send_keys(phone_number)
    password.send_keys(password_text)
    login_button.click()
    # After login: type the keyword and click the search button.
    input_for_company = driver.find_element(By.ID, 'home-main-search')
    input_for_company.send_keys(company_name)
    search_company = driver.find_element(
        By.XPATH,
        '//div[@class="input-group inputV2"]/div[@class="input-group-addon search_button white-btn "]')
    search_company.click()
    return driver
def page_change():
    """Move the shared ``driver`` to the next search-result page.

    Sleeps 10 s to let the current (Angular-rendered) result list settle,
    reads the "next page" link's href and navigates to it.

    :return: HTML source of the newly loaded page, or ``None`` when no
        "next page" link exists (last page reached) — the caller treats a
        falsy value as end-of-pagination.
    """
    time.sleep(10)  # crude wait for the result list to render
    # find_elements_by_css_selector was removed in Selenium 4; use By.
    next_links = driver.find_elements(By.CSS_SELECTOR, "li.pagination-next.ng-scope > a")
    if not next_links:
        # Original code indexed [0] unconditionally and raised IndexError
        # on the last page; signal end-of-pagination explicitly instead.
        mylog.info(' * no next-page link found, pagination finished')
        return None
    driver.get(next_links[0].get_attribute('href'))
    mylog.info(' * 下一页生成成功')
    return driver.page_source
def html_data_get(html):
    """Harvest company detail-page URLs from one search-result page.

    Parses *html* with BeautifulSoup, walks every result container div
    (one per company), pulls out the first anchor of its right-hand item
    block and appends that href to the module-level ``urls_list``.
    """
    soup = bs(html, 'lxml')
    company_divs = soup.select('div.b-c-white.search_result_container > div')
    for company_div in company_divs:
        first_anchor = company_div.select('div.search_right_item.ml10 > div')[0].select('a')[0]
        urls_list.append(first_anchor.attrs['href'])
    print(urls_list)
def main():
    """Drive the whole scrape: log in, search, paginate, then analyse.

    Logs in and searches for '联想', harvests detail URLs from the first
    result page, then pages forward (the ``while i < 1`` bound currently
    limits this to one extra page) collecting more URLs, and finally
    visits each collected company detail page.
    """
    selenium_deal('https://www.tianyancha.com/login', '联想')
    # First result page: harvest the detail URLs, then advance.
    html_data_get(driver.page_source)
    html = page_change()
    # Second, third, ... pages (bound of 1 keeps this to a single pass).
    i = 0
    while i < 1:
        i += 1
        try:
            if html:
                # Page exists: collect its URLs and move to the next one.
                html_data_get(html)
                html = page_change()
            else:
                break
        except Exception:
            # A bare ``except:`` would also swallow KeyboardInterrupt and
            # SystemExit; catch Exception, log, and try the next page.
            mylog.error(traceback.format_exc())
    company_analyse()
def company_analyse():
    """Visit every collected company URL and dump its base-info table cells.

    Iterates the module-level ``urls_list``, loads each detail page in the
    shared ``driver``, parses it, and prints every <td> of the base-info
    table to stdout.
    """
    for company_url in urls_list:
        driver.get(company_url)
        page = bs(driver.page_source, 'lxml')
        cells = page.select('div #_container_baseInfo > div > div.baseInfo_model2017 > table > tbody > tr > td')
        for cell in cells:
            print(cell)
if __name__ == '__main__':
    # Selenium 4 removed the ``executable_path`` keyword; the driver binary
    # is now supplied via a Service object.
    # NOTE(review): this chromedriver path is machine-specific — adjust it,
    # or put chromedriver on PATH and construct Chrome() with no Service.
    driver = webdriver.Chrome(
        service=Service(r'C:\Users\your_user\AppData\Local\Google\Chrome\Application\chromedriver.exe'))
    main()