TextMining/markov_analysis.py at master · TehyaStockman/TextMining · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import sys
import string
import random
import pickle
from pickle import dump, load
from text_processing import processing_file, any_lowercase

# Module-level state shared by process_word() and random_text().
# process_word() mutates both as it is fed words one at a time.
suffix_map = {}        # maps each prefix (tuple of words) to the list of words seen right after it
prefix = ()            # the most recent words seen, as a tuple; grows up to `order` words, then slides


def process_file(filename, order=2):
    """Reads a file and performs Markov analysis.

    The text comes from processing_file(filename) (imported from
    text_processing); every whitespace-separated word in it is passed to
    process_word(), which updates the module-level suffix_map and prefix.

    filename: string
    order: integer number of words in the prefix

    Returns: map from prefix to list of possible suffixes (the module-level
    suffix_map, which accumulates across calls).
    """
    # processing_file() is handed the file *name*, so this function has no
    # need to open the file itself; the previous extra open() only leaked
    # a handle that was never read or closed.
    # NOTE(review): assumes processing_file() returns the text as a single
    # string -- confirm against text_processing.
    text = processing_file(filename)

    # split() with no arguments breaks on any whitespace run, newlines
    # included, which yields the same words as splitting line by line.
    for word in text.split():
        process_word(word, order)

    return suffix_map


def skip_gutenberg_header(fp):
    """Advances fp past the header.

    Consumes lines from fp up to and including the first line that begins
    with '*** START OF THIS'. If no such line exists, fp is consumed to
    the end.

    fp: open file object (any iterable of lines)
    """
    header_end = '*** START OF THIS'
    # The generator pulls lines lazily, so asking for its first match stops
    # reading right after the marker line (or at end of input).
    marker_lines = (text for text in fp if text.startswith(header_end))
    next(marker_lines, None)


def process_word(word, order=2):
    """Feeds one word into the Markov analysis.

    word: string
    order: integer

    Until the module-level prefix holds `order` words, the word is only
    appended to it. After that, the word is recorded in suffix_map as a
    suffix of the current prefix, and the prefix slides forward by one word.
    """
    global prefix

    if len(prefix) >= order:
        # Prefix is full: record the word, then slide the window.
        suffix_map.setdefault(prefix, []).append(word)
        prefix = shift(prefix, word)
    else:
        # Still warming up: just collect the word.
        prefix = prefix + (word,)


def random_text(n=100, out_path='fairy_tales_grimm_random.txt'):
    """Generates random words from the analyzed text and writes them to a file.

    Starts with a random prefix from suffix_map (not weighted by frequency),
    then repeatedly picks a random suffix of the current prefix and slides
    the prefix forward. When the current prefix has no recorded suffixes
    (the end of the original text was reached), generation restarts from a
    fresh random prefix and keeps the words produced so far.

    n: number of words to generate
    out_path: path of the file to (over)write with the generated text

    Returns: the generated text, each word followed by a single space.
    An empty suffix_map produces an empty string / empty file.
    """
    # Materialize the keys once: random.choice needs an indexable sequence,
    # and a dict view is not one on Python 3.
    prefixes = list(suffix_map)
    words = []

    if prefixes:
        start = random.choice(prefixes)
        for _ in range(n):
            if start not in suffix_map:
                # Dead end: restart in place rather than recursing, so the
                # words already generated are kept and the output file is
                # opened only once.
                start = random.choice(prefixes)
            word = random.choice(suffix_map[start])
            start = shift(start, word)
            words.append(word)

    # Same on-disk format as before: every word is followed by one space.
    word_string = ''.join(word + ' ' for word in words)

    with open(out_path, 'w') as story:
        story.write(word_string)

    return word_string


def shift(t, word):
    """Slides a prefix tuple forward by one word.

    t: tuple of strings
    word: string

    Returns: a new tuple made of every element of t except the first,
    followed by word.
    """
    without_head = t[1:]
    new_tail = (word,)
    return without_head + new_tail


def main(name, filename='', n=1000, order=2, *args):
    """Analyzes a text file and writes random text generated from it.

    name: script name (the sys.argv[0] slot); not used
    filename: string, file passed to process_file()
    n: number of words to generate (integer or numeric string)
    order: prefix length in words (integer or numeric string)
    *args: any extra arguments are accepted and ignored

    Returns: None. If n or order cannot be converted to an integer, a usage
    message is printed and nothing else is done.
    """
    try:
        n = int(n)
        order = int(order)
    except (TypeError, ValueError):
        # Only a failed conversion means bad usage; anything else should
        # propagate instead of being swallowed by a bare except.
        # The parenthesized form prints the same text on Python 2 and 3.
        print('Usage: randomtext.py filename [# of words] [prefix length]')
        return

    process_file(filename, order)
    random_text(n)


# Runs the whole analysis as a side effect of importing/executing this module.
# main() does not return a value, so random_story is always None; the
# generated text ends up in the file written by random_text().
random_story = main('grimm_fairytales', 'grimm_fairytales.txt', n=1000, order= 2)


#process_file('emma.txt')
#if __name__ == '__main__':
#    main(*sys.argv)