# -*- encoding=UTF-8 -*-
"""Scrape the chapter index of http://seputu.com/ into ``qiye.csv``.

author: vfast
name: spider
date: 2021/6/24

The index page groups chapter links under volume headings.  Every chapter
link whose ``title`` attribute looks like ``[<date>] <chapter title>`` becomes
one CSV row: ``(volume title, chapter title, href, date)``.

All text is kept as ``str`` from download to CSV output.  The earlier
``.encode('utf-8')`` calls produced ``bytes``, which cannot be searched with a
``str`` regular expression on Python 3 and would be written to the CSV as
``b'...'`` literals.
"""
import csv
import re

# NOTE: ``requests`` and ``lxml`` are imported inside the functions that use
# them so that the pure helpers below (e.g. ``split_title``) can be imported
# with only the standard library available.

START_URL = 'http://seputu.com/'
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can block forever
OUTPUT_PATH = 'qiye.csv'
CSV_HEADER = ['title', 'real_title', 'href', 'date']

# Matches a link title of the form "[<date>] <chapter title>".
TITLE_PATTERN = re.compile(r'\s*\[(.*)\]\s+(.*)')


def split_title(box_title):
    """Split a link title into ``(date, real_title)``.

    Args:
        box_title: The ``title`` attribute of a chapter link (``str``).

    Returns:
        A ``(date, real_title)`` tuple of ``str``, or ``None`` when the title
        does not have the ``[<date>] <chapter title>`` shape.
    """
    match = TITLE_PATTERN.search(box_title)
    if match is None:
        return None
    return match.group(1), match.group(2)


def fetch_page(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
    """
    import requests

    response = requests.get(
        url, headers={'User-Agent': USER_AGENT}, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty CSV.
    response.raise_for_status()
    return response.text


def extract_rows(page_text):
    """Parse the index page and return the list of CSV rows.

    Args:
        page_text: HTML source of the index page.

    Returns:
        A list of ``(volume title, chapter title, href, date)`` tuples, in
        document order.  Links without ``href``/``title`` attributes, or whose
        title does not match ``TITLE_PATTERN``, are skipped.
    """
    from lxml import etree

    if not page_text or not page_text.strip():
        return []
    # Parse the page with lxml.
    html = etree.HTML(page_text)
    if html is None:
        return []

    rows = []
    for div_mulu in html.xpath('.//*[@class="mulu"]'):
        # Locate the volume heading (<h2>) of this section.
        div_h2 = div_mulu.xpath(
            './div[@class="mulu-title"]/center/h2/text()')
        if not div_h2:
            continue
        h2_title = str(div_h2[0])

        for a in div_mulu.xpath('./div[@class="box"]/ul/li/a'):
            href = a.get('href')
            box_title = a.get('title')
            if href is None or box_title is None:
                # Indexing xpath(...)[0] would raise IndexError here.
                continue
            parsed = split_title(box_title)
            if parsed is None:
                continue
            date, real_title = parsed
            rows.append((h2_title, real_title, href, date))
    return rows


def write_csv(rows, path=OUTPUT_PATH):
    """Write the header line followed by *rows* to the CSV file at *path*."""
    # newline='' is required by the csv module (otherwise blank lines appear
    # on Windows); the explicit encoding keeps the Chinese titles intact
    # regardless of the platform's default locale.
    with open(path, 'w', newline='', encoding='utf-8') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(CSV_HEADER)
        f_csv.writerows(rows)


def main():
    """Fetch the index page, extract the chapter rows and save them."""
    write_csv(extract_rows(fetch_page(START_URL)), OUTPUT_PATH)


if __name__ == '__main__':
    main()
# Source: study notes from the book "Python爬虫开发与项目实战" (Python crawler development and hands-on projects).