Python爬虫爬取世界各地时间

最新推荐文章于 2024-06-14 09:50:01 发布

我真的搞不懂啊

最新推荐文章于 2024-06-14 09:50:01 发布

阅读量102

点赞数 3

文章标签： python 爬虫 爬山算法 网络爬虫 原型模式

本文链接：https://blog.csdn.net/weixin_45781685/article/details/138523367

版权

import requests
import json
import re
from requests.exceptions import RequestException
import time

# Fetch the content of a single page.
def get_one_page(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when ``requests`` raises a
        ``RequestException`` (connection error, timeout, ...).
    """
    # Send a browser-like User-Agent header with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0(Macintosh;Intel Mac OS X 10_13_3)AppleWebKit/537.36(KHTML,like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # Without a timeout a stalled server would block this call forever;
        # a timeout surfaces as a RequestException subclass handled below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

# Parse the content of the fetched page.
def parse_one_page(html):
    """Yield one record per matching ``<li>`` element found in *html*.

    Args:
        html: Page source as a string. ``None`` or an empty string (for
            example after a failed download) produces no records.

    Yields:
        Dicts with two keys: ``'location'`` (the text captured before the
        closing ``</a>``) and ``'time'`` (the text between ``</a>`` and
        ``</li>``).
    """
    if not html:
        # Nothing to parse; re.findall would raise TypeError on None.
        return
    # Regular expression selecting the text to scrape from the page markup.
    pattern = re.compile(r'<li.*?bg.*?title.*?>(.*?)</a>(.*?)</li>')
    for location, time_text in pattern.findall(html):
        yield {
            'location': location,
            'time': time_text,
        }

# Write a record to the output file.
def write_to_file(content, filename='text.txt'):
    """Append *content* to *filename* as one JSON document per line.

    Args:
        content: JSON-serializable object to store.
        filename: Path of the output file; defaults to ``'text.txt'`` in
            the current working directory.
    """
    with open(filename, 'a', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # emitting \uXXXX escapes.
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

# Main routine: fetch the page, parse it, and store every record.
def main():
    """Download the target page, then print and save each parsed record."""
    # Address of the page to scrape for times.
    url = 'http://www.24timemap.com/'
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a non-200 status or a request error;
        # report it instead of handing None to the parser.
        print('Failed to fetch page: ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

# Start the main routine only when this file is run as a script.
if __name__ == '__main__':
    main()