# -*- coding: utf-8 -*-
"""Scrape TV listing titles, publish times and links from tvtv.hk and save them to a CSV file.

Fetches the first two pages of the 'tv' archive category, extracts each entry's
title, datetime and link via XPath, and writes the rows to 静态解析.csv.
"""
import csv
import time
import warnings

import chardet
import requests
from lxml import etree

# Suppress the InsecureRequestWarning emitted because requests is called with verify=False.
warnings.filterwarnings("ignore")


def get_date(url):
    """Fetch one archive listing page and extract its entries.

    Parameters
    ----------
    url : str
        Full URL of the listing page to fetch (e.g. .../archives/category/tv/page/1).
        NOTE: the original version overwrote this parameter with a hard-coded URL,
        so every page in the caller's loop fetched the same index page; fixed here.

    Returns
    -------
    tuple[list, list, list]
        (titles, datetimes, links) — three parallel lists of strings pulled out
        of the page via XPath. Empty lists if the page structure does not match.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/71.0.3578.98 Safari/537.36'}
    rqg = requests.get(url, headers=headers, verify=False)
    # Detect the real encoding from the raw bytes so non-UTF-8 pages decode correctly.
    rqg.encoding = chardet.detect(rqg.content)['encoding']
    html = etree.HTML(rqg.text)
    title = html.xpath('//header[@class="entry-header"]/table/tbody/tr/td[2]/font/h2/a/@title')
    time_ = html.xpath('//header[@class="entry-header"]/table/tbody/tr/td[2]/div/time/@datetime')
    link = html.xpath('//header[@class="entry-header"]/table/tbody/tr/td[2]/font/h2/a/@href')
    return title, time_, link


def main():
    """Crawl pages 1-2 of the archive and write all collected rows to 静态解析.csv."""
    base = 'http://www.tvtv.hk/archives/category/tv/page/'
    url_list = [base + str(i) for i in range(1, 3)]
    headers = ['标题:', '发布时间:', '链接:']
    titles = []
    times_ = []
    links = []
    for url in url_list:
        print('>>正获取:', url)
        title, time_, link = get_date(url)
        titles.extend(title)
        times_.extend(time_)
        links.extend(link)
        time.sleep(3)  # be polite: pause between page requests
    # zip truncates to the shortest list, guarding against an IndexError if the
    # three XPath result lists ever come back with mismatched lengths.
    values = [[t, d, l_] for t, d, l_ in zip(titles, times_, links)]
    # utf-8-sig (BOM) so the Chinese CSV opens correctly in Excel regardless of locale.
    with open("静态解析.csv", 'w', newline='', encoding='utf-8-sig') as F:
        write = csv.writer(F)
        write.writerow(headers)
        write.writerows(values)


if __name__ == "__main__":
    main()