## 优化代码4
# Scraped publish date: join every text node returned by the XPath query into
# one string (each item is converted with str()).
ori_date = selector.xpath('//div[@class="short_r"]/text()')
list_date = "".join(str(x) for x in ori_date)
# Replace the Chinese date markers BEFORE parsing, so that a value such as
# "2020年4月3日" becomes "2020-4-3" and matches the "%Y-%m-%d" format below.
# The result is stored back into list_date rather than into a variable named
# `time`, which would shadow the `time` module used on the next lines.
# Surrounding whitespace is stripped because strptime rejects trailing data.
list_date = list_date.replace('年', '-').replace('月', '-').replace('日', '').strip()
# Core date formatting: parse the date, convert it to a local-time epoch
# timestamp, then render it back as "YYYY-MM-DD HH:MM:SS".
list_time = time.mktime(time.strptime(list_date, "%Y-%m-%d"))
crawl_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(list_time))
# Date-string conversion demo: parse "<full month name>-<day>-<year>" text
# into a datetime object and print both the input and the parsed value.
date_str = "September-26-2012"
print(date_str)
# %B = full month name, %d = day of month, %Y = four-digit year.
month_day_year_format = "%B-%d-%Y"
date = datetime.strptime(date_str, month_day_year_format)
print(date)
## 优化代码5
# 获取列表的链接 soup模式
def parse_num(self, response):
    """Collect absolute detail-page URLs from the list page in ``response``.

    Parses ``response.text`` with BeautifulSoup, finds the ``<ul>`` elements
    selected by the ``class_`` filter below, and joins the ``href`` of every
    ``<a>`` inside them onto the site base URL.

    NOTE(review): in the code visible here the collected list is neither
    returned nor yielded, so the function returns ``None`` -- confirm whether
    the caller expects the URLs.
    """
    # Base URL against which every (possibly relative) href is resolved.
    base_url = "http://fgw.shanxi.gov.cn/"
    # Parse the text body of the response.
    document = BeautifulSoup(response.text, 'html.parser')
    # Locate all matching menu containers.
    menu_lists = document.find_all('ul', class_="submenu-dropbox_subtabs_content fixmt10")
    # Build one absolute URL per anchor, in document order.
    urls_list = [
        urljoin(base_url, anchor.get('href'))
        for menu in menu_lists
        for anchor in menu.find_all('a')
    ]
# HTTP request headers: a desktop Chrome User-Agent string plus a fixed
# Cookie value.
# NOTE(review): the cookie is hard-coded; presumably it was copied from a
# browser session -- confirm it is still valid / safe to keep in source.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.122 Safari/537.36'
    ),
    'Cookie': '_trs_uv=k8bbz1t7_695_505y; _trs_ua_s_1=k8bitkz9_1941_fyps',
}
31、Python日常抓取数据的高效代码
最新推荐文章于 2024-04-20 17:02:27 发布