"""
__title__ = ''
__author__ = 'Thompson'
__mtime__ = '2018/7/26'
# code is far away from bugs with the god animal protecting
I love animals. They taste delicious.
┏┓ ┏┓
┏┛┻━━━┛┻┓
┃ ☃ ┃
┃ ┳┛ ┗┳ ┃
┃ ┻ ┃
┗━┓ ┏━┛
┃ ┗━━━┓
┃ 神兽保佑 ┣┓
┃ 永无BUG! ┏┛
┗┓┓┏━┳┓┏┛
┃┫┫ ┃┫┫
┗┻┛ ┗┻┛
"""
from selenium import webdriver
import time
import random
from bs4 import BeautifulSoup
import json
# --- Fetch the page, triggering lazy-loaded content by scrolling to the bottom ---
browser = webdriver.Chrome()
try:
    browser.get("http://tech.163.com/")
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        print('页面加载中...')
        # Scroll once to the bottom so the page fetches the next batch of articles.
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait for new content to render. Keep a 2-second floor: the original
        # random.random()*10 could sleep ~0s, making the unchanged scroll
        # height look like "fully loaded" and breaking the loop too early.
        time.sleep(2 + random.random() * 8)
        # Compare the new scroll height with the previous one; if it did not
        # grow, no more content was appended and the page is fully loaded.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    html = browser.page_source
finally:
    # quit() (not close()) terminates the whole WebDriver session, so the
    # chromedriver process does not linger — even if an exception was raised.
    browser.quit()
# --- Extract article titles/URLs from the rendered HTML and persist as JSON lines ---
soup = BeautifulSoup(html, 'lxml')
ls = soup.select('div.data_row.news_article.clearfix')
print(len(ls))
# The 'with' block guarantees the file is closed even if extraction raises.
with open('./data/163tech.json', 'w', encoding='utf-8') as file:
    for item in ls:
        # Select the headline anchor once instead of running the same
        # CSS query twice per row.
        anchors = item.select('h3 > a')
        if not anchors:
            # Skip rows without a headline link instead of crashing
            # with an IndexError on malformed markup.
            continue
        link = anchors[0]
        title = link.get_text()
        print('title:', title)
        url = link['href']
        print('url:', url)
        # One JSON object per line (JSON Lines) keeps the read-back trivial.
        content = json.dumps({'title': title, 'url': url}, ensure_ascii=False) + "\n"
        file.write(content)
# --- Read the JSON-lines file back and verify every record parses cleanly ---
with open('./data/163tech.json', 'r', encoding='utf-8') as file:
    # Iterate the file lazily line-by-line; readlines() would load the
    # whole file into memory for no benefit.
    for line in file:
        print(json.loads(line))