A piece of code
'''
By Zhiyuan Yao (zyao9@stevens.edu)
- Oct 26, 2018
'''
from bs4 import BeautifulSoup
import requests
from urllib.request import urlretrieve
from datetime import datetime as dt
import os, re
from pathlib import Path
data_format = '10-k'
DOMAIN = 'https://www.sec.gov'
class TenKDownloader:
def __init__(self, cik, start_date, end_date):
        if isinstance(cik, str):
            cik = [cik]
        elif isinstance(cik, list):
            for i, ele in enumerate(cik):
                assert isinstance(ele, str), f'cik at index {i} is not a string: {type(ele)}'
        else:
            raise TypeError(f'CIK should be a string or a list of strings, input type is {type(cik)}')
self.CIK = cik
self.start_date = dt.strptime(start_date,'%Y%m%d')
self.end_date = dt.strptime(end_date,'%Y%m%d')
self.all_url = {}
self.cwd = os.getcwd()
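    # A minimal usage sketch of the constructor above (ticker symbols taken
    # from the commented example at the bottom of this file; EDGAR's company
    # search generally accepts tickers in the CIK field):
    #   downloader = TenKDownloader(['AAPL', 'GOOG'], '20150101', '20180101')
    #   downloader.download('./data')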
def download(self, target = './data', reset_flag=False):
os.chdir(self.cwd)
os.chdir(target)
for c in self.CIK:
try:
if reset_flag:
result = self._search_each(c)
else:
if c in self.all_url:
continue
else:
result = self._search_each(c)
except ValueError as info:
print(info)
continue
try:
os.mkdir(c)
except FileExistsError:
pass
os.chdir(f'./{c}')
for each in result:
print(f'Downloading {c}:{each[0]} {each[1]}')
                # keep the original file extension (e.g. ".htm") from the document URL
                filename = each[0] + each[1][each[1].rfind('.'):]
                urlretrieve(each[1], filename)
                print(f'File saved in {os.path.join(os.getcwd(), filename)}')
self.all_url[c] = result
os.chdir('..')
os.chdir('..')
def _search_each(self, cik):
assert cik in self.CIK, '%s is not in CIK list'%cik
url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type=10-k&dateb=&owner=exclude&count=40'
search_page = requests.get(url)
assert search_page.status_code == 200, 'request code for search page: %s' % search_page.status_code
search_head = BeautifulSoup(search_page.content, 'html.parser')
search_result = search_head.select('table.tableFile2 tr')
if len(search_result)==0:
raise ValueError(f'Result for {cik} is not available, {url}')
        search_result.pop(0)  # drop the table header row
start_idx, end_idx = self._search_date([self._get(item, 'date') for item in search_result], self.start_date, self.end_date)
result = []
for i in range(start_idx, end_idx+1):
            if self._get(search_result[i], 'type') != '10-K':
                # skip amendments (10-K/A) and other filing types returned by the search
                continue
date = self._get(search_result[i], 'date').strftime('%Y%m%d')
            sub_url = DOMAIN + search_result[i].find('a', attrs={"id": "documentsbutton"})['href']
company_page = requests.get(sub_url)
assert company_page.status_code == 200, 'request code for company page: %s' % company_page.status_code
company_head = BeautifulSoup(company_page.content, 'html.parser')
file_table = company_head.select('table.tableFile')[0].select('tr')
            file_table.pop(0)  # drop the table header row
            for item in file_table:
                # the Documents page lists several files; stop at the row whose
                # Type column contains "10-K" (the filing document itself)
                if '10-K' in item.select('td')[3].contents[0]:
                    break
            doc_url = item.select('td a')[0]['href']
            result.append((date, DOMAIN + doc_url))
return result
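        # `result` above is a list of (filing_date, document_url) tuples, one
        # per 10-K found in the requested date range, e.g. (URL shortened for
        # illustration):
        #   [('20170801', 'https://www.sec.gov/Archives/...'), ...]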
    def _get(self, item, info):
        # extract one field from a search-result row: column 0 holds the
        # filing type, column 3 holds the filing date
        if info == 'date':
            date = item.select('td')[3].contents[0]
            ret = dt.strptime(date, '%Y-%m-%d')
elif info == 'url':
            ret = DOMAIN + item.find('a', attrs={"id": "documentsbutton"})['href']
elif info == 'type':
ret = item.select('td')[0].contents[0]
else:
raise NotImplementedError
return ret
    def _search_date(self, ls, start, end):
        # ls holds filing dates in descending order (newest first); return the
        # index range [si, ei] whose dates fall inside [start, end]
        h, t = ls[-1], ls[0]
        assert start <= t and end >= h, f'Available time interval: {h} to {t}, input: {start} to {end}'
if start >= h:
ei, _ = self._bsearch_dec(ls, start)
else:
ei = len(ls)-1
if end <= t:
_, si = self._bsearch_dec(ls, end)
else:
si = 0
return si, ei
    def _bsearch_dec(self, ls, point):
        # binary search on a descending list: returns (a, b) with b == a + 1,
        # where ls[a] >= point and (b == len(ls) or ls[b] < point); callers
        # guarantee ls[0] >= point
        a = 0
        b = len(ls)
        while b - a > 1:
            tmp = (a + b) // 2
            if ls[tmp] >= point:
                a = tmp
            else:
                b = tmp
        return a, b
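    # A small illustration of _bsearch_dec on a descending list (hypothetical
    # dates; the callers in _search_date guarantee ls[0] >= point):
    #   ls = [dt(2018, 1, 1), dt(2016, 1, 1), dt(2014, 1, 1)]
    #   _bsearch_dec(ls, dt(2015, 6, 1))  # -> (1, 2)
    # ls[1] (2016) is the last entry on or after the search point, and
    # ls[2] (2014) is the first entry before it.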
class TenKScraper:
def __init__(self, section, next_section):
self.all_section = [str(i) for i in range(1, 16)] + ['1A', '1B', '7A', '9A', '9B']
        # pull the bare section number (e.g. '1A') out of inputs like 'Item 1A' or '1A'
        section = re.findall(r'\d.*\w*', section.upper())[0]
        next_section = re.findall(r'\d.*\w*', next_section.upper())[0]
        if section not in self.all_section:
            raise ValueError(f'Section: {section} is not available, available sections: {self.all_section}')
        if next_section not in self.all_section:
            raise ValueError(f'Section: {next_section} is not available, available sections: {self.all_section}')
self.section = 'Item ' + section
self.next_section = 'Item ' + next_section
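        # For example, TenKScraper('Item 1A', 'Item 1B') and TenKScraper('1A', '1B')
        # both normalize to self.section == 'Item 1A' and self.next_section == 'Item 1B'.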
def scrape(self, input_path, output_path):
with open(input_path, 'rb') as input_file:
page = input_file.read() # <===Read the HTML file into Python
        # Pre-process the HTML content by removing extra white space and
        # combining it into one line
        page = page.strip()  # <=== remove white space at the beginning and end
        page = page.replace(b'\n', b' ')  # <=== replace the \n (new line) character with a space
        page = page.replace(b'\r', b'')  # <=== remove the \r (carriage return, if the file came from Windows)
        page = page.replace(b'&nbsp;', b' ')  # <=== replace "&nbsp;" (an HTML entity for space) with a space
        page = page.replace(b'&#160;', b' ')  # <=== replace "&#160;" (the numeric form of the same entity) with a space
        while b'  ' in page:
            page = page.replace(b'  ', b' ')  # <=== collapse runs of spaces into one
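        # As a quick illustration, a fragment like b'<b>  Item\n1A.&nbsp;</b>'
        # comes out of the steps above as b'<b> Item 1A. </b>', so the regex
        # patterns below can match across what were originally line breaks.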
        # Use regular expressions to extract the text that sits between two
        # item subtitles (e.g. ITEM 1A and ITEM 1B) as displayed in the HTML
        # (.+?) represents everything between the two subtitles
        # If you want to extract something else, here is what you should change
        # Different filings mark up the subtitles differently, so we define a
        # list of candidate patterns and try them in order
p1 = bytes(r'bold;\">\s*' + self.section + r'\.(.+?)bold;\">\s*' + self.next_section + r'\.',
encoding='utf-8')
p2 = bytes(r'b>\s*' + self.section + r'\.(.+?)b>\s*' + self.next_section + r'\.', encoding='utf-8')
p3 = bytes(r'' + self.section + r'\.\s*<\/b>(.+?)' + self.next_section + r'\.\s*<\/b>', encoding='utf-8')
p4 = bytes(r'' + self.section + r'\.\s*[^<>]+\.\s*<\/b(.+?)' + self.next_section + r'\.\s*[^<>]+\.\s*<\/b',
encoding='utf-8')
p5 = bytes(r'b>\s*<font[^>]+>\s*' + self.section + r'\.(.+?)b>\s*<font[^>]+>\s*' + self.next_section + r'\.', encoding='utf-8')
p6 = bytes(r'' + self.section.upper() + r'\.\s*<\/b>(.+?)' + self.next_section.upper() + r'\.\s*<\/b>', encoding='utf-8')
p7 = bytes(r'underline;\">\s*' + self.section + r'\<\/font>(.+?)underline;\">\s*'+ self.next_section + r'\.\s*\<\/font>',encoding = 'utf-8')
p8 = bytes(r'underline;\">\s*' + self.section + r'\.\<\/font>(.+?)underline;\">\s*'+ self.next_section + r'\.\s*\<\/font>',encoding = 'utf-8')
p9 = bytes(r'<font[^>]+>\s*' + self.section + r'\:(.+?)\<font[^>]+>\s*'+self.next_section + r'\:\s*',encoding = 'utf-8')
p10 = bytes(r'<font[^>]+>\s*' + self.section + r'\.\<\/font>(.+?)\<font[^>]+>\s*' + self.next_section + r'\.',encoding = 'utf-8')
p11 = bytes(r'' + self.section + r'\.(.+?)<font[^>]+>\s*' + self.next_section + r'\.\<\/font>',encoding = 'utf-8')
p12 = bytes(r'b>\s*<font[^>]+>\s*' + self.section + r'(.+?)b>\s*<font[^>]+>\s*' + self.next_section + r'\s*\<\/font>', encoding='utf-8')
p13 = bytes(r'' + self.section + r'\.\s*[^<>]+\.\s*<\/b(.+?)b>\s*' + self.next_section + r'\.',
encoding='utf-8')
regexs = (
p1, # <===pattern 1: with an attribute bold before the item subtitle
p2, # <===pattern 2: with a tag <b> before the item subtitle
p3, # <===pattern 3: with a tag <\b> after the item subtitle
p4, # <===pattern 4: with a tag <\b> after the item+description subtitle
p5, # <===pattern 5: with a tag <b><font ...> before the item subtitle
p6, # <===pattern 6: with a tag <\b> after the item subtitle (ITEM XX.<\b>)
            p7,  # <===pattern 7: with an attribute underline before the item subtitle (no trailing period)
            p8,  # <===pattern 8: with an attribute underline before the item subtitle (with trailing period)
            p9,  # <===pattern 9: with a <font ...> tag before the item subtitle, followed by a colon
            p10,  # <===pattern 10: with a <font ...> tag around the item subtitle (ITEM XX.</font>)
            p11,  # <===pattern 11: plain item subtitle, with a <font ...> tag before the next item
            p12,  # <===pattern 12: with <b><font ...> before the item subtitle, no trailing period
            p13)  # <===pattern 13: item+description subtitle ending in </b, next item preceded by <b>
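        # As a concrete (hypothetical) example, p2 would match a fragment like
        #   <b> Item 1A.</b> Risk Factors ... <b> Item 1B.</b>
        # with group(1) capturing the HTML between the two bolded subtitles.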
# Now we try to see if a match can be found...
for regex in regexs:
match = re.search(regex, page,
flags=re.IGNORECASE) # <===search for the pattern in HTML using re.search from the re package. Ignore cases.
# If a match exist....
if match:
# Now we have the extracted content still in an HTML format
# We now turn it into a beautiful soup object
# so that we can remove the html tags and only keep the texts
soup = BeautifulSoup(match.group(1),
"html.parser") # <=== match.group(1) returns the texts inside the parentheses (.*?)
# soup.text removes the html tags and only keep the texts
                rawText = soup.text.encode('utf8')  # <=== encode the Unicode text back to UTF-8 bytes for writing
# remove space at the beginning and end and the subtitle "business" at the beginning
# ^ matches the beginning of the text
# outText = re.sub(b"^business\s*", b"", rawText.strip(), flags=re.IGNORECASE)
                Path(output_path).touch()  # create the output file if it does not exist yet
with open(output_path, "wb") as output_file:
output_file.write(rawText)
break # <=== if a match is found, we break the for loop. Otherwise the for loop continues
if match is None:
print(f'No matched sections: {self.section}, {self.next_section} found in {input_path}.')
return None
else:
return rawText
if __name__ == '__main__':
    # Example download run (commented out):
    # company_name = ['AAPL', 'GOOG']
    # downloader = TenKDownloader(company_name, '20150101', '20180101')
    # downloader.download()
    scraper = TenKScraper('Item 1A', 'Item 1B')  # scrape the text starting at Item 1A and stopping at Item 1B
    scraper2 = TenKScraper('Item 7', 'Item 8')
    scraper.scrape('./data/1326160/20110225.htm', './data/txt/test.txt')
    scraper2.scrape('./data/1326160/20110225.htm', './data/txt/test2.txt')