[Crawler Column 12] Scraping Douyu live streams with XPath, regex, and JSON

XPath and regex

import requests
import re
import csv
from requests.exceptions import RequestException
from lxml import etree
import pandas as pd


def get_one_page(url):
    # Fetch the page HTML; return None on a bad status code or request error
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    # 'w' overwrites the file on every run; open with 'a+' to append instead,
    # which is the same idea as pandas to_csv with mode='a'.
    # utf-8-sig keeps the Chinese readable when the CSV is opened in Excel.
    csv_file = open(r'E:\vscode_code\练习\斗鱼\douyu_data.csv', 'w', newline='', encoding='utf-8-sig')
    writer = csv.writer(csv_file)
    writer.writerow(['name', 'category', 'streamer', 'hot'])

    # Regex route: room title and category from the cover-card markup
    pattern1 = re.compile(r'<h3 class="DyListCover-intro" title=(.*?)>', re.S)
    pattern2 = re.compile(r'<span class="DyListCover-zone">(.*?)</span>', re.S)
    names = pattern1.findall(html)
    zones = pattern2.findall(html)

    # XPath route: streamer name and hot value. Watch for full-width quotes
    # and colons when pasting expressions copied off the web, otherwise lxml
    # raises "Invalid expression".
    s = etree.HTML(html)
    streamers = s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/h2/text()')
    hots = s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/span/text()')

    rows = list(zip(names, zones, streamers, hots))
    writer.writerows(rows)
    csv_file.close()

    basic_data = pd.DataFrame(rows, columns=['name', 'category', 'streamer', 'hot'])
    print(basic_data)


def main(page):
    url = 'https://www.douyu.com/directory/all'
    html = get_one_page(url)
    print('scraping page', page + 1)
    parse_one_page(html)


if __name__ == '__main__':
    for i in range(1):  # the static directory URL only ever serves page 1
        main(i)
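A quick way to sanity-check the two extraction routes is to run them against a stripped-down card. The markup below is reconstructed from the regex and XPath above, not copied from the live page (Douyu's front end changes often), so treat it purely as an illustration of what each route returns:

import re
from lxml import etree

# Reconstructed, simplified card markup -- an assumption based on the
# regex/XPath in this post, not a capture of the real Douyu page.
sample = '''
<div id="listAll"><section></section><section><div></div><div><ul>
<li><div><a>
<div></div>
<div>
<div>
<h3 class="DyListCover-intro" title=一起来玩>一起来玩</h3>
<span class="DyListCover-zone">英雄联盟</span>
</div>
<div>
<h2>某主播</h2>
<span class="DyListCover-hot">123.4万</span>
</div>
</div>
</a></div></li>
</ul></div></section></div>
'''

# Regex pulls the room title and category straight out of the raw text
print(re.findall(r'<h3 class="DyListCover-intro" title=(.*?)>', sample, re.S))   # ['一起来玩']
print(re.findall(r'<span class="DyListCover-zone">(.*?)</span>', sample, re.S))  # ['英雄联盟']

# XPath walks the parsed element tree instead
s = etree.HTML(sample)
print(s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/h2/text()'))    # ['某主播']
print(s.xpath('//*[@id="listAll"]/section[2]/div[2]/ul/li/div/a[1]/div[2]/div[2]/span/text()'))  # ['123.4万']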

JSON
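The static directory HTML only covers what the first page renders; flipping pages fires an XHR that returns JSON, which is also easier to pick apart than regex or XPath. Below is a minimal sketch, assuming the paginated endpoint https://www.douyu.com/gapi/rkc/directory/0_0/{page} and the field names rn (room title), c2name (category), nn (streamer), ol (hot) that older write-ups report; both the URL and the keys are assumptions to verify in the browser's DevTools Network tab before relying on them. The output filename is illustrative.

import csv
import requests

# Assumed endpoint and field names -- confirm in DevTools; they are not
# taken from the HTML version above.
BASE = 'https://www.douyu.com/gapi/rkc/directory/0_0/{}'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}

with open(r'E:\vscode_code\练习\斗鱼\douyu_json.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['name', 'category', 'streamer', 'hot'])
    for page in range(1, 3):                   # first two pages as a demo
        resp = requests.get(BASE.format(page), headers=headers)
        rooms = resp.json()['data']['rl']      # 'rl' = list of room dicts (assumed)
        for room in rooms:
            writer.writerow([room.get('rn'), room.get('c2name'),
                             room.get('nn'), room.get('ol')])
        print('page', page, ':', len(rooms), 'rooms')

Pagination here is just an integer in the URL, which is why the JSON route scales past page 1 where the HTML route cannot.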
