# Scrape Wallstreetcn and Sina Finance news data with Selenium
import time

import pymongo
from bs4 import BeautifulSoup
from selenium import webdriver

# from fake_useragent import UserAgent
# ua_list = UserAgent()
# Fixed fallback User-Agent string (defined here, but not wired into the scrapers below)
ua_list = 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
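# Sketch: one way to actually apply a User-Agent to Chrome is to pass it through
# ChromeOptions. The lines below are illustrative only and left commented out;
# UserAgent().random assumes the fake_useragent package hinted at above.
# options = webdriver.ChromeOptions()
# options.add_argument('user-agent=' + ua_list)  # or UserAgent().random
# driver = webdriver.Chrome(r"/usr/local/share/chromedriver", options=options)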
def get_hej_news():
    """Scrape macro news from the Wallstreetcn (华尔街见闻) live feed."""
    client = pymongo.MongoClient('localhost', 27017)
    news = client['news']
    hej_news = news['hej_news']
    chromedriver = r"/usr/local/share/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    # Open the target URL with get()
    driver.get('https://wallstreetcn.com/live/global')
    # Scroll the page down in steps: window.scrollBy(0, scrollStep), where
    # scrollStep is the distance of each scroll, so lazy-loaded items render
    js = 'window.scrollBy(0,3000)'
    driver.execute_script(js)
    time.sleep(5)
    js = 'window.scrollBy(0,5000)'
    driver.execute_script(js)
    time.sleep(5)
    pages = driver.page_source
    soup = BeautifulSoup(pages, 'html.parser')
    soup1 = soup.find('div', class_='livenews')
    content = soup1.find_all('div', class_='live-item')
    for i in content:
        new_time = i.find('span', attrs={'class': 'live-item__time__text'}).get_text()
        news_text = i.find('div', attrs={'class': 'content-html'}).get_text().strip().replace('\n', '')
        # De-duplicate on timestamp: drop any previously stored item first
        if hej_news.count_documents({'new_time': new_time}) != 0:
            hej_news.delete_many({'new_time': new_time})
        data = {
            'new_time': new_time,
            'news': news_text
        }
        hej_news.insert_one(data)
    driver.quit()
    print('Wallstreetcn macro news stored successfully')
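# Alternative de-duplication sketch: a single atomic upsert can replace the
# count/delete/insert sequence used above. Hypothetical helper, not called by
# this script; usage would be e.g. upsert_news(hej_news, 'new_time', data).
def upsert_news(collection, key_field, doc):
    """Keep exactly one document per key_field value via an upsert."""
    collection.replace_one({key_field: doc[key_field]}, doc, upsert=True)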
def get_xlcj_news():
    """Scrape breaking news from the Sina Finance (新浪财经) live feed."""
    client = pymongo.MongoClient('localhost', 27017)
    news = client['news']
    xlcj_news = news['xlcj_news']
    chromedriver = r"/usr/local/share/chromedriver"
    # Launch one browser and reuse it across all six pages
    driver = webdriver.Chrome(chromedriver)
    num = 1
    while num < 7:
        url = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=' + str(num)
        # Open the target URL with get()
        driver.get(url)
        # Scroll the page down in steps so lazy-loaded items render
        js = 'window.scrollBy(0,3000)'
        driver.execute_script(js)
        time.sleep(5)
        js = 'window.scrollBy(0,5000)'
        driver.execute_script(js)
        time.sleep(5)
        pages = driver.page_source
        soup = BeautifulSoup(pages, 'html.parser')
        soup1 = soup.find('div', class_='bd_list')
        content = soup1.find_all('div', class_='bd_i_og')
        num += 1
        for i in content:
            news_time = i.find('p', attrs={'class': 'bd_i_time_c'}).get_text().strip()
            news_type = i.find('p', attrs={'class': 'bd_i_tags'}).get_text().strip().replace('\n', '')
            news_text = i.find('p', attrs={'class': 'bd_i_txt_c'}).get_text()
            print(news_time, news_type, news_text)
            # De-duplicate on timestamp before inserting
            if xlcj_news.count_documents({'news_time': news_time}) != 0:
                xlcj_news.delete_many({'news_time': news_time})
            data = {
                'news_time': news_time,
                'news_type': news_type,
                'news': news_text
            }
            xlcj_news.insert_one(data)
    driver.quit()
    print('Sina Finance live breaking news stored successfully')
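# Sketch of a sturdier scrolling strategy than the fixed scrollBy/sleep pairs
# used in both scrapers: keep scrolling until document.body.scrollHeight stops
# growing, so slower pages still load their full feed. Illustrative helper,
# not wired into the functions above.
def scroll_to_bottom(driver, pause=2, max_rounds=10):
    """Scroll until the page height stabilizes (for lazy-loaded feeds)."""
    last_height = driver.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break
        last_height = new_height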
def main():
    # his_time = input('Enter a news time to query (format: 2017-11-2 00:00:00): ')
    # history_time = str(time.mktime(time.strptime(his_time, '%Y-%m-%d %H:%M:%S'))).replace('.0', '')
    get_hej_news()
    get_xlcj_news()


if __name__ == '__main__':
    main()
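# Note: both scrapers open a visible browser window. For unattended runs Chrome
# can be started headless; the lines below are illustrative and commented out.
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# driver = webdriver.Chrome(r"/usr/local/share/chromedriver", options=options)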