词频直方图
算词频
import nltk
from nltk.corpus import gutenberg
# List the file identifiers of the texts available in the Gutenberg corpus reader.
gutenberg.fileids()
['austen-emma.txt',
'austen-persuasion.txt',
'austen-sense.txt',
'bible-kjv.txt',
'blake-poems.txt',
'bryant-stories.txt',
'burgess-busterbrown.txt',
'carroll-alice.txt',
'chesterton-ball.txt',
'chesterton-brown.txt',
'chesterton-thursday.txt',
'edgeworth-parents.txt',
'melville-moby_dick.txt',
'milton-paradise.txt',
'shakespeare-caesar.txt',
'shakespeare-hamlet.txt',
'shakespeare-macbeth.txt',
'whitman-leaves.txt']
# Load the tokens of 'austen-emma.txt' through the corpus reader imported above.
emma = gutenberg.words('austen-emma.txt')
# Show which class the reader returned for the token sequence.
type(emma)
nltk.corpus.reader.util.StreamBackedCorpusView
# Total number of tokens in the text; punctuation tokens are counted as well.
len(emma)
192427
# Average number of occurrences per distinct token:
# total token count divided by the number of distinct tokens.
len(emma)/len(set(emma))
24.63538599411087
# Number of distinct tokens. The comparison is case-sensitive, so e.g.
# 'DEAR' and 'Dear' count separately, and punctuation tokens are included.
len(set(emma))
7811
# Distinct tokens in sorted order. Punctuation and digit tokens come first,
# followed by tokens starting with an uppercase letter.
sorted(set(emma))
['!',
'!"',
'!"--',
"!'",
"!'--",
'!)--',
'!--',
'!--"',
'!--(',
'!--`',
'"',
'"\'',
'"--',
'"`',
'&',
"'",
"'--",
"';",
'(',
')',
'),',
')--',
').',
').--',
');--',
',',
',"',
',"--',
",'",
',\'"',
',)',
',--',
',--"',
'-',
'--',
'--"',
'--(',
'--,',
'----',
'----------,',
"--------.'",
'--.',
'--."',
"--.'",
'--:',
'--`',
'.',
'."',
'."--',
".'",
'.\'"',
".'--",
".'--`",
'.)',
'.,',
'.,"',
".,'",
'.--',
'.--"',
'.--`',
'.]',
'000',
'10',
'1816',
'23rd',
'24th',
'26th',
'28th',
'7th',
'8th',
':',
':"',
':"--',
":'",
":'--",
':--',
':--"',
';',
';"',
';"--',
";'",
";'--",
';--',
';--"',
'?',
'?"',
'?"--',
'?"--"',
"?'",
'?\'"',
'?)--',
'?--',
'?--"',
'?--(',
'A',
'Abbey',
'Abbots',
'Abdy',
'Abominable',
'About',
'Absence',
'Absolute',
'Absolutely',
'Absurd',
'According',
'Accordingly',
'Acquit',
'Actually',
'Adelaide',
'Adopt',
'After',
'Agreed',
'Agricultural',
'Ah',
'Aladdin',
'Alas',
'Alderneys',
'All',
'Almane',
'Almost',
'Although',
'Altogether',
'Always',
'Am',
'Ambition',
'Amiable',
'An',
'And',
'Angry',
'Anna',
'Anne',
'Another',
'Anxious',
'Any',
'Anywhere',
'Apologies',
'Approve',
'April',
'Are',
'Arthur',
'As',
'Assured',
'Astley',
'Astonished',
'At',
'August',
'Augusta',
'Aunt',
'Austen',
'Aye',
'Bad',
'Balls',
'Baly',
'Barnes',
'Baronne',
'Bates',
'Bateses',
'Bath',
'Be',
'Bear',
'Beautiful',
'Beavers',
'Before',
'Beg',
'Behold',
'Being',
'Believe',
'Bella',
'Besides',
'Better',
'Between',
'Beyond',
'Bickerton',
'Bird',
'Birmingham',
'Birth',
'Bless',
'Blessed',
'Boarding',
'Bond',
'Books',
'Both',
'Bought',
'Box',
'Bragge',
'Bragges',
'Braithwaites',
'Break',
'Bristol',
'Broadway',
'Broadwood',
'Brother',
'Brown',
'Brunswick',
'Business',
'Busy',
'But',
'By',
'C',
'CHAPTER',
'CHARADE',
'CHURCHILL',
'Call',
'Campbell',
'Campbells',
'Can',
'Candles',
'Cannot',
'Captain',
'Caroline',
'Catherine',
'Cautious',
'Ceremonies',
'Certain',
'Certainly',
'Charming',
'Children',
'Chili',
'Christian',
'Christmas',
'Church',
'Churchill',
'Churchills',
'Chuse',
'Circumstances',
'Clara',
'Clayton',
'Clifton',
'Cobham',
'Cole',
'Coles',
'Colonel',
'Come',
'Command',
'Common',
'Compare',
'Compliments',
'Composure',
'Compressed',
'Comtesse',
'Conceive',
'Concession',
'Conjecture',
'Consider',
'Considering',
'Contrary',
'Cooper',
'Could',
'Cowper',
'Cox',
'Coxe',
'Coxes',
'Cramer',
'Cromer',
'Crown',
'DEAR',
'Dancing',
'Dating',
'Day',
'Dear',
'Dearer',
'Deceived',
'December',
'Decidedly',
'Delighted',
'Delightful',
'Depend',
'Did',
'Difference',
'Dining',
'Dinner',
'Dirty',
'Disingenuousness',
'Disputable',
'Dixon',
'Dixons',
'Do',
'Does',
'Don',
'Donwell',
'Dorking',
'Dr',
'Dreadful',
'Dublin',
'During',
'E',
'Early',
'Easter',
'Either',
'Elegant',
'Elizabeth',
'Elton',
'Eltons',
'Em',
'Emma',
'Encouragement',
'End',
'Engaged',
'England',
'English',
'Enscombe',
'Escape',
'Especially',
'Even',
'Ever',
'Every',
'Exactly',
'Excellent',
'Excellently',
'Except',
'Excepting',
'Excuse',
'Exquisite',
'Extracts',
'Extraordinary',
'Extremely',
'F',
'FINIS',
'Fairfax',
'Fancying',
'Farm',
'Farmer',
'February',
'Fetch',
'Find',
'Fine',
'Finesse',
'Five',
'For',
'Forcing',
'Ford',
'Forest',
'Former',
'Fortunate',
'Fortunately',
'Fortune',
'Four',
'Fourteen',
'Frank',
'French',
'Friday',
'From',
'Full',
'Garrick',
'General',
'Genlis',
'George',
'Gilbert',
'Gilberts',
'Give',
'Go',
'God',
'Goddard',
'Going',
'Goldsmith',
'Gone',
'Good',
'Graham',
'Grandmama',
'Grandpapa',
'Granted',
'Gratifying',
'Great',
'Green',
'Grove',
'Ha',
'Had',
'Half',
'Hall',
'Handsome',
'Hannah',
'Happier',
'Happily',
'Happy',
'Harriet',
'Harry',
'Hart_',
'Hartfield',
'Has',
'Have',
'Having',
'Hawkins',
'Hazle',
'He',
'Heaven',
'Heavens',
'Henceforward',
'Henry',
'Her',
'Here',
'Hetty',
'High',
'Highbury',
'Hill',
'Him',
'His',
'Hitherto',
'Hodges',
'Holyhead',
'How',
'However',
'Hughes',
'Hum',
'Human',
'Humph',
'Hush',
'Hymen',
'I',
'II',
'III',
'IV',
'IX',
'If',
'Ill',
'Imagine',
'Immediately',
'Impossible',
'Impropriety',
'Imprudent',
'In',
'Increase',
'Indeed',
'Indifferent',
'Indignation',
'Inn',
'Instances',
'Instead',
'Insufferable',
'Interference',
'Intimacy',
'Invite',
'Ireland',
'Irish',
'Is',
'Isabella',
'It',
'Italian',
'Its',
'JULY',
'James',
'Jane',
'January',
'Jeffereys',
'John',
'Judge',
'July',
'June',
'Just',
'K',
'Keep',
'Kindled',
'King',
'Kings',
'Kingston',
'Kitty',
'Knightley',
'Knightleys',
'La',
'Ladies',
'Lady',
'Lane',
'Langham',
'Larkins',
'Late',
'Later',
'Latterly',
'Leave',
'Let',
'Letters',
'Liable',
'Lieut',
'Like',
'Little',
'Lively',
'Living',
'London',
'Long',
'Look',
'Lord',
'Lords',
'Low',
'M',
'MADAM',
'MY',
'Ma',
'Madam',
'Madame',
'Madeira',
'Madness',
'Making',
'Man',
'Manchester',
'Manners',
'Many',
'Maple',
'March',
'Mark',
'Marriage',
'Married',
'Martin',
'Martins',
'Master',
'Matrimony',
'May',
'Me',
'Men',
'Mermaids',
'Methodical',
'Michaelmas',
'Mickleham',
'Middling',
'Midsummer',
'Might',
'Mill',
'Milmans',
'Mine',
'Miniatures',
'Miss',
'Misses',
'Mistake',
'Mistresses',
'Mitchell',
'Monday',
'More',
'Morning',
'Most',
'Mr',
'Mrs',
'Much',
'Must',
'My',
'Myself',
'Mystery',
'N',
'Name',
'Nash',
'Natural',
'Nature',
'Nay',
'Neither',
'Neptune',
'Never',
'News',
'No',
'Nobody',