import nltk
from dateutil import parser
from dateutil import tz
import datetime
from lxml import html
import lxml.html.clean
from bs4 import BeautifulSoup
# Parsing dates and times with dateutil.parser.
# Each entry is (input text, keyword options); expected output in the comment.
PARSE_SAMPLES = [
    ('Thu Sep 25 10:36:28 2010', {}),                 # 2010-09-25 10:36:28
    ('Thursday, 25. September 2010 10:36AM', {}),     # 2010-09-25 10:36:00
    ('9/25/2010 10:36:28', {}),                       # 2010-09-25 10:36:28
    ('9/25/2010', {}),                                # 2010-09-25 00:00:00
    ('2010-09-25T10:36:28Z', {}),                     # 2010-09-25 10:36:28+00:00
    # Ambiguous orderings are resolved with dayfirst / yearfirst.
    ('25/9/2010', {'dayfirst': True}),                # 2010-09-25 00:00:00
    ('10-9-25', {}),                                  # 2025-10-09 00:00:00
    ('10-9-25', {'yearfirst': True}),                 # 2010-09-25 00:00:00
    # fuzzy=True skips tokens that are not part of a date ("at about").
    ('9/25/2010 at about 10:36AM', {'fuzzy': True}),  # 2010-09-25 10:36:00
]
for text, options in PARSE_SAMPLES:
    print(parser.parse(text, **options))
# Looking up and converting time zones with dateutil.tz.
tz.tzutc()  # the UTC tzinfo object
# tzutc's offset is always zero. Use an aware "now" rather than the
# naive datetime.datetime.utcnow(), which is deprecated since Python 3.12;
# the printed result (0:00:00) is the same either way.
print(tz.tzutc().utcoffset(datetime.datetime.now(datetime.timezone.utc)))
# 0:00:00
print(datetime.timedelta(0))
# 0:00:00
print(tz.gettz('US/Pacific'))
# e.g. tzfile('/usr/share/zoneinfo/US/Pacific') -- repr is platform-dependent
# Extracting URLs from HTML with lxml
# (lxml is a Python binding for the C libraries libxml2 and libxslt).
doc = html.fromstring('Hello <a href="www.baidu.com">world</a>')
print(doc)
# e.g. <Element p at 0x212f696b228> -- the bare fragment is wrapped in a <p>
links = list(doc.iterlinks())
# Was a discarded bare expression (REPL leftover); print it so the script
# actually shows the link count.
print(len(links))
# 1
(el, attr, link, pos) = links[0]
print(el, attr, link, pos)
# e.g. <Element a at 0x175be942ea8> href www.baidu.com 0
# (the original transcript comment claimed "/world", which does not match
# the href parsed above)
doc.make_links_absolute('http://hello')
abslinks = list(doc.iterlinks())
(el, attr, link, pos) = abslinks[0]
print(link)
# http://hello/www.baidu.com
# iterlinks also works directly on a string of HTML.
links = list(html.iterlinks('Hello <a href="/world">world</a>'))
print(links[0][2])
# /world
# Cleaning and stripping HTML: the cleaner removes unsafe markup such as
# the onload event handler below.
dirty_html = '<html><head></head><body onload=loadfunc()>my text</body></html>'
print(lxml.html.clean.clean_html(dirty_html))
# e.g. <div><body>my text</body></div> -- exact wrapper tags may vary by
# lxml version; the onload attribute is gone
# Converting HTML entities with Beautiful Soup.
# The inputs must be entity references ('&lt;', '&amp;'): feeding the bare
# characters '<' and '&' parses to an empty document whose .string is None,
# which defeats the purpose of the demo.
print(BeautifulSoup('&lt;', features="lxml").string)
# <
print(BeautifulSoup('&amp;', features="lxml").string)
# &