Web Crawling

Over the past few days I have been crawling some news corpora; here is a short summary of what I learned:
1. You can manually look for patterns in the site's URLs and write the addresses to crawl into a txt file (see the sketch after the header example below).
2. Set request headers to mimic a browser, so the site does not detect the crawler and cut the session off:

self.header = {
            # 'cookie': '__gads=ID=510d384eea0592e5:T=1561861082:S=ALNI_Ma9KgyBvcWfhnUp8XkVkIB6TJRzog; _ga=GA1.2.1810481865.1561861082; vruid=e774-e455-6bbe-1561867251; has_js=1; Hm_lvt_94444c9ea980451456647b253e40ad9f=1564705106,1565263545,1565447158,1565675611; _gid=GA1.2.4428587.1565675613; sessionStatusZB=1; pa-submit=.3114-1565675835825; Hm_lpvt_94444c9ea980451456647b253e40ad9f=1565675883',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }
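
A minimal sketch combining tips 1 and 2, assuming the target addresses have already been collected by hand into a hypothetical urls.txt (one URL per line); the header dict is the same one shown above:

import requests

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
}

# urls.txt is a hypothetical file with one target URL per line,
# filled in by hand after spotting the site's URL pattern
with open('urls.txt', encoding='utf-8') as f:
    targets = [line.strip() for line in f if line.strip()]

for url in targets:
    res = requests.get(url, headers=header, timeout=10)
    res.encoding = 'utf-8'
    print(url, len(res.text))  # replace with the real parsing / saving logic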

3. Remember to close the response and add a short pause, so the browser disguise is more thorough:

import time

res.close()    # release the connection once the body has been read
time.sleep(1)  # pause briefly before the next request
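
To make the request rhythm look even less mechanical, the fixed one-second pause can be replaced by a random delay. A small sketch of a reusable fetch helper (the 1-3 second interval is my own choice, not from the original):

import random
import time

import requests


def polite_get(url, header):
    # fetch one page, close the response, then sleep a random interval
    res = requests.get(url, headers=header, timeout=10)
    res.encoding = 'utf-8'
    html = res.text
    res.close()
    time.sleep(random.uniform(1, 3))  # random 1-3 second pause
    return html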

4. Crawling dynamically loaded pages:

First press F12, open the Network tab and filter by JS. Scroll the page by hand and watch for the new request that gets generated; click on that request and locate its
Request URL: https://www.cyberctm.com/zh_TW/news/lazyload?type=%E5%9C%8B%E9%9A%9B%E6%96%B0%E8%81%9E&page=2&limit=10&offset=10
That is the dynamic link. Request it directly and you will find it is actually a JSON response, which can be loaded with the code below:
import json
from urllib import request

# url is the Request URL found above
url = ('https://www.cyberctm.com/zh_TW/news/lazyload'
       '?type=%E5%9C%8B%E9%9A%9B%E6%96%B0%E8%81%9E&page=2&limit=10&offset=10')

with request.urlopen(url, timeout=10) as response:
    content = response.read()
print(content)

data_list = json.loads(content)
print(type(data_list))
print(len(data_list))
for i in data_list:
    text_1 = i['content_t']
    text_2 = text_1.replace('<P>', '')     # strip opening <P> tags
    text_3 = text_2.replace('</P>', '')    # strip closing </P> tags
    text_4 = text_3.replace('&nbsp;', '')  # strip non-breaking spaces
    print(text_4)
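
Because the Request URL carries type, page, limit and offset parameters, the same endpoint can be paged through in a loop. A minimal sketch, assuming offset simply advances by limit for each page (an assumption read off the URL above, not verified against the site):

import json
import time
from urllib import parse, request

base = 'https://www.cyberctm.com/zh_TW/news/lazyload'
limit = 10

for page in range(2, 6):  # page range picked arbitrarily for illustration
    params = {
        'type': '國際新聞',
        'page': page,
        'limit': limit,
        'offset': (page - 1) * limit,  # assumed: offset advances by limit per page
    }
    url = base + '?' + parse.urlencode(params)
    with request.urlopen(url, timeout=10) as response:
        items = json.loads(response.read())
    for item in items:
        print(item['content_t'])
    time.sleep(1)  # pause between pages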

5. Full example code:

from bs4 import BeautifulSoup
import re, json
import requests
from datetime import datetime


class downloader(object):
    def __init__(self, url_target):
        self.target = url_target  # target URL
        self.header = {
            # 'cookie': '__cfduid=d9f7d437c55121035c533fec00ae560551565261934; cf_clearance=1bf1420cd0d784aed9a3a990968d1755787e5cd7-1565261948-604800-250; _ga=GA1.3.1012363973.1565261950; EU_COOKIE_LAW_CONSENT=true; _ym_uid=1565262940639941638; _ym_d=1565262940; ivx_990498=55; ivx_990397=265; ivx_990372=180; ivx_990158=60; _gid=GA1.3.432812653.1565671800; _ym_isad=1; _ym_wasSynced=%7B%22time%22%3A1565675010184%2C%22params%22%3A%7B%22eu%22%3A0%7D%2C%22bkParams%22%3A%7B%7D%7D; _dc_gtm_UA-33948949-1=1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }

    def get_fanye_url(self, target):
        req = requests.get(url=target, headers=self.header)  # send the request
        req.encoding = "utf-8"  # set the encoding
        html = req.text  # get the page source
        bf = BeautifulSoup(html, 'lxml')  # parse with BeautifulSoup
        fanye = []  # pagination links
        fanye_code = bf.find_all('a', href=re.compile('column/node_29'))  # <a> tags whose href matches the pagination pattern
        for i in fanye_code:
            print(i)
        if len(fanye_code) == 0:
            fanye.append(target)
        else:
            for x in fanye_code:  # iterate over the matched <a> tags
                link = x.get('href')  # extract the link
                if link:
                    print('fanye_url', link)
                    fanye.append(link)  # store it
        fanye.pop()  # drop the last entry
        return fanye

    def get_news_url(self, fanye):
        req = requests.get(fanye, headers=self.header)  # send the request
        req.encoding = "utf-8"  # set the encoding
        html = req.text  # get the page source
        bf = BeautifulSoup(html, 'lxml')  # parse with BeautifulSoup
        news_linklist = []  # news article links
        news_url = bf.find_all('a', href=re.compile('http://www.takungpao.com/news/232108/'))  # <a> tags matching the news URL pattern
        for x in news_url:  # iterate over the matched <a> tags
            link = x.get('href')  # extract the link
            if link:
                # print('news_url:', link)
                news_linklist.append(link)  # store it
        return news_linklist

    def getnewsdetail(self, newsurl):  # fetch the article text from a single news page
        result = {}
        res = requests.get(url=newsurl, headers=self.header)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.text, 'html.parser')
        # result['title'] = soup.select('.main-title')[0].text  # title
        # timesource = soup.select('.date-source span')[0].text
        # result['time'] = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M').strftime('%Y-%m-%d')  # publication time
        # result['place'] = soup.select('.source')[0].text  # source
        article = []  # collect the article paragraphs
        for p in soup.select('p'):
            article.append(p.text.strip())
        articleall = ' '.join(article)
        print('article:', articleall)
        result['article'] = articleall
        # result['editor'] = soup.select('#article p')[-1].text.strip('责任编辑:')  # editor name
        return articleall


if __name__ == "__main__":
    starttime = datetime.now()
    url_chain = 'http://www.takungpao.com.hk/hongkong/'
    url_chain_2 = 'http://www.takungpao.com.hk/mainland/'
    url_chain_3 = 'http://www.takungpao.com.hk/taiwan/'
    url_chain_4 = 'http://www.takungpao.com.hk/international/'
    url_chain_5 = 'http://www.takungpao.com.hk/finance/'
    url_chain_6 = 'http://www.takungpao.com.hk/culture/'
    url_chain_7 = 'http://www.takungpao.com.hk/sports/'
    url_chain_8 = 'http://www.takungpao.com.hk/ent/'
    url_chain_9 = 'http://renwen.takungpao.com/html/100/'
    url_chain_10 = 'http://renwen.takungpao.com/html/800/'
    url_chain_11 = 'http://www.takungpao.com/news/index.html'
    url_chain_12 = 'http://www.takungpao.com/news/232108/index.html'
    dl = downloader(url_chain_10)
    print('start getting fanye_url...')
    # fanye = dl.get_fanye_url(dl.target)

    # for x in fanye:
    #     b = dl.get_fanye_url(x)
    #     for w in b:  # de-duplicate while collecting the pagination links
    #         if w not in fanye:
    #             if w == 'https://www.zaobao.com/news/china?page=2':
    #                 break
    #             fanye.append(w)

                # print("翻页链接" + w)
    fanye = []
    for i in range(2, 11):
        url = 'http://www.takungpao.com/news/232108/' + str(i) + '.html'
        fanye.append(url)
    print('fanye_length:', len(fanye))

    print('start getting news url...')
    # news_url = ['https://www.sinchew.com.my/content/content_2098401.html']
    news_url = []
    for x in fanye:
        a = dl.get_news_url(x)
        for w in a:  # de-duplicate while collecting the news links
            if w not in news_url:
                news_url.append(w)
                print("新闻地址" + w)
    print('news_url_length:', len(news_url))

    print('start getting news content...')
    count = 0
    txt_dir = '../data/xiang_gang03.txt'
    for news_index in news_url:
        count += 1
        news_content = dl.getnewsdetail(news_index)
        print('The ' + str(count) + ' news_content is being written...')
        with open(txt_dir, 'a', encoding='utf-8') as f:  # open/close the file for each article
            f.write('The ' + str(count) + ' news:' + '\n')
            f.write(news_content + '\n' * 3)
    endtime = datetime.now()
    cost_time = endtime - starttime
    with open(txt_dir, 'a', encoding='utf-8') as f:
        f.write('\n' * 3 + str(cost_time))
    print('The cost time :', cost_time)

        # json_str = json.dumps(xinwen, ensure_ascii=False, indent=1)
        # with open(txt_dir, 'w') as json_file:
        #     json_file.write(json_str)

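One caveat about this example: the downloader object is constructed with url_chain_10, but the pagination list is then built by hand from http://www.takungpao.com/news/232108/, and get_fanye_url / get_news_url hard-code their regular expressions to column/node_29 and news/232108 respectively. To reuse the class on any of the other url_chain_* sections, the constructor argument and both patterns have to be changed together.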
