"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/26'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import json
# --- Fetch the page, triggering lazy-loaded content by scrolling to the bottom ---
browser = webdriver.Chrome()
try:
    browser.get("http://tech.163.com/")
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        print('页面加载中...')
        # Scroll once to the bottom so the page fetches the next batch of articles.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for new content to render. Keep a 2-second floor: the original
        # random.random()*10 could sleep ~0s, making the unchanged scroll
        # height look like "fully loaded" and breaking the loop too early.
        time.sleep(2 + random.random() * 8)
        # Compare the new scroll height with the previous one; if it did not
        # grow, no more content was appended and the page is fully loaded.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    html = browser.page_source
finally:
    # quit() (not close()) terminates the whole WebDriver session, so the
    # chromedriver process does not linger — even if an exception was raised.
    browser.quit()
# --- Extract article titles/URLs from the rendered HTML and persist as JSON lines ---
soup = BeautifulSoup(html, 'lxml')
ls = soup.select('div.data_row.news_article.clearfix')
print(len(ls))
# The 'with' block guarantees the file is closed even if extraction raises.
with open('./data/163tech.json', 'w', encoding='utf-8') as file:
    for item in ls:
        # Select the headline anchor once instead of running the same
        # CSS query twice per row.
        anchors = item.select('h3 > a')
        if not anchors:
            # Skip rows without a headline link instead of crashing
            # with an IndexError on malformed markup.
            continue
        link = anchors[0]
        title = link.get_text()
        print('title:', title)
        url = link['href']
        print('url:', url)
        # One JSON object per line (JSON Lines) keeps the read-back trivial.
        content = json.dumps({'title': title, 'url': url}, ensure_ascii=False) + "\n"
        file.write(content)
# --- Read the JSON-lines file back and verify every record parses cleanly ---
with open('./data/163tech.json', 'r', encoding='utf-8') as file:
    # Iterate the file lazily line-by-line; readlines() would load the
    # whole file into memory for no benefit.
    for line in file:
        print(json.loads(line))