Python Program Generating N-Gram Language Model

I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.

Any suggestion would be really helpful.

def N_Gram(corpus, n):

corpus = ''.join(corpus)

corpus = corpus.split(' ')

output = {}

for i in range(len(corpus)-n+1):

    g = ' '.join(corpus[i:i+n])

    output.setdefault(g,0)

    output[g] += 1

return output



def Uni_Generation():

corpus = ReadFile()

uni = N_Gram(corpus, 1)

print(uni)

final = unsmoothed_totalcount(uni)

print(final)

sentence_list =  # the list of 5 sentences

for b in xrange(0,5):

    sentence = '<s> '

    while sentence.split()[-1] != '</s>': #last word is not </s>

        sentence += return_random_selected_item(final, uni)

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'

        #print(sentence)

    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list



def unsmoothed_totalcount(n_gram_dict):

#get total count of words

keyList = n_gram_dict.keys()

final = 0

for x in n_gram_dict:

    for i,a in enumerate(keyList):

        if a == x:

            prev_word = keyList[i-1]

    prev_count = n_gram_dict[prev_word] # get the previous word count

    final += prev_count

return final



def find_next_word(N_Gram_dic, histo):

#find all bigram start with histo, use the count and do everything

corpus = ReadFile()

n_count_dic = {}

uni_dic= N_Gram(corpus, 1)

if histo.count(' ') == 0: # there is no history for it

    #print("get here")

    n_count_dic = uni_dic

else:

    keyList = N_Gram_dic.keys()

    #print(keyList)

    leng = histo.count(' ') + 1

    #print("histo is ")

    #print(histo)

    for a in keyList:

        #word = ' '.join(a.split()[:leng])

        #print("word is " + word)

        if(histo == a):



    # if histo == ' '.join(a.split()[:leng]):

            n_count_dic[a] = N_Gram_dic[a]

#print(n_count_dic)

final = unsmoothed_totalcount(n_count_dic)

#print("i did my best")

#print(final)

return return_random_selected_item(final, n_count_dic)



def return_random_selected_item(total_count, n_count_dict):

print(total_count)

r = random.randint(1,total_count)

for x in n_count_dict:

    f1 = n_count_dict[x]

    if r - f1 <= 0 : # if the word choosen is not ending token

        return x.split()[-1] + ' '

    if r > f1:

        r = r - f1

and the main part is here

def N_Gram_Generation(n):

corpus = ReadFile()

if n == 1:

    return Uni_Generation()



uni_gram = N_Gram(corpus, 1)

n_Gram = N_Gram(corpus, n)

N_m1_Gram = N_Gram(corpus, n-1)

#final = unsmoothed_totalcount(N_m1_Gram)

sentence_list =  # the list of 5 sentences



for b in xrange(0,5):

    sentence = '<s> '

    while len(sentence.split()) < n:

            word = find_next_word(uni_gram, sentence.split()[-1])

            sentence += word

    while sentence.split()[-1] != '</s>':

        # list = sentence.split()

        # his= list[-(n-1):]

        # histor = ' '.join(his)

        # print(histor)

        next_word = find_next_word(n_Gram, sentence.split()[-1])

        if next_word != '</s>':

            sentence += next_word

        if next_word == '</s>':

            break

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'



    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list

Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!

asked 4 mins ago

Yuhe Zhu

233

add a comment |

Any suggestion would be really helpful.

def N_Gram(corpus, n):

corpus = ''.join(corpus)

corpus = corpus.split(' ')

output = {}

for i in range(len(corpus)-n+1):

    g = ' '.join(corpus[i:i+n])

    output.setdefault(g,0)

    output[g] += 1

return output



def Uni_Generation():

corpus = ReadFile()

uni = N_Gram(corpus, 1)

print(uni)

final = unsmoothed_totalcount(uni)

print(final)

sentence_list =  # the list of 5 sentences

for b in xrange(0,5):

    sentence = '<s> '

    while sentence.split()[-1] != '</s>': #last word is not </s>

        sentence += return_random_selected_item(final, uni)

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'

        #print(sentence)

    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list



def unsmoothed_totalcount(n_gram_dict):

#get total count of words

keyList = n_gram_dict.keys()

final = 0

for x in n_gram_dict:

    for i,a in enumerate(keyList):

        if a == x:

            prev_word = keyList[i-1]

    prev_count = n_gram_dict[prev_word] # get the previous word count

    final += prev_count

return final



def find_next_word(N_Gram_dic, histo):

#find all bigram start with histo, use the count and do everything

corpus = ReadFile()

n_count_dic = {}

uni_dic= N_Gram(corpus, 1)

if histo.count(' ') == 0: # there is no history for it

    #print("get here")

    n_count_dic = uni_dic

else:

    keyList = N_Gram_dic.keys()

    #print(keyList)

    leng = histo.count(' ') + 1

    #print("histo is ")

    #print(histo)

    for a in keyList:

        #word = ' '.join(a.split()[:leng])

        #print("word is " + word)

        if(histo == a):



    # if histo == ' '.join(a.split()[:leng]):

            n_count_dic[a] = N_Gram_dic[a]

#print(n_count_dic)

final = unsmoothed_totalcount(n_count_dic)

#print("i did my best")

#print(final)

return return_random_selected_item(final, n_count_dic)



def return_random_selected_item(total_count, n_count_dict):

print(total_count)

r = random.randint(1,total_count)

for x in n_count_dict:

    f1 = n_count_dict[x]

    if r - f1 <= 0 : # if the word choosen is not ending token

        return x.split()[-1] + ' '

    if r > f1:

        r = r - f1

and the main part is here

def N_Gram_Generation(n):

corpus = ReadFile()

if n == 1:

    return Uni_Generation()



uni_gram = N_Gram(corpus, 1)

n_Gram = N_Gram(corpus, n)

N_m1_Gram = N_Gram(corpus, n-1)

#final = unsmoothed_totalcount(N_m1_Gram)

sentence_list =  # the list of 5 sentences



for b in xrange(0,5):

    sentence = '<s> '

    while len(sentence.split()) < n:

            word = find_next_word(uni_gram, sentence.split()[-1])

            sentence += word

    while sentence.split()[-1] != '</s>':

        # list = sentence.split()

        # his= list[-(n-1):]

        # histor = ' '.join(his)

        # print(histor)

        next_word = find_next_word(n_Gram, sentence.split()[-1])

        if next_word != '</s>':

            sentence += next_word

        if next_word == '</s>':

            break

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'



    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list

Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!

asked 4 mins ago

Yuhe Zhu

233

add a comment |

Any suggestion would be really helpful.

def N_Gram(corpus, n):

corpus = ''.join(corpus)

corpus = corpus.split(' ')

output = {}

for i in range(len(corpus)-n+1):

    g = ' '.join(corpus[i:i+n])

    output.setdefault(g,0)

    output[g] += 1

return output



def Uni_Generation():

corpus = ReadFile()

uni = N_Gram(corpus, 1)

print(uni)

final = unsmoothed_totalcount(uni)

print(final)

sentence_list =  # the list of 5 sentences

for b in xrange(0,5):

    sentence = '<s> '

    while sentence.split()[-1] != '</s>': #last word is not </s>

        sentence += return_random_selected_item(final, uni)

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'

        #print(sentence)

    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list



def unsmoothed_totalcount(n_gram_dict):

#get total count of words

keyList = n_gram_dict.keys()

final = 0

for x in n_gram_dict:

    for i,a in enumerate(keyList):

        if a == x:

            prev_word = keyList[i-1]

    prev_count = n_gram_dict[prev_word] # get the previous word count

    final += prev_count

return final



def find_next_word(N_Gram_dic, histo):

#find all bigram start with histo, use the count and do everything

corpus = ReadFile()

n_count_dic = {}

uni_dic= N_Gram(corpus, 1)

if histo.count(' ') == 0: # there is no history for it

    #print("get here")

    n_count_dic = uni_dic

else:

    keyList = N_Gram_dic.keys()

    #print(keyList)

    leng = histo.count(' ') + 1

    #print("histo is ")

    #print(histo)

    for a in keyList:

        #word = ' '.join(a.split()[:leng])

        #print("word is " + word)

        if(histo == a):



    # if histo == ' '.join(a.split()[:leng]):

            n_count_dic[a] = N_Gram_dic[a]

#print(n_count_dic)

final = unsmoothed_totalcount(n_count_dic)

#print("i did my best")

#print(final)

return return_random_selected_item(final, n_count_dic)



def return_random_selected_item(total_count, n_count_dict):

print(total_count)

r = random.randint(1,total_count)

for x in n_count_dict:

    f1 = n_count_dict[x]

    if r - f1 <= 0 : # if the word choosen is not ending token

        return x.split()[-1] + ' '

    if r > f1:

        r = r - f1

and the main part is here

def N_Gram_Generation(n):

corpus = ReadFile()

if n == 1:

    return Uni_Generation()



uni_gram = N_Gram(corpus, 1)

n_Gram = N_Gram(corpus, n)

N_m1_Gram = N_Gram(corpus, n-1)

#final = unsmoothed_totalcount(N_m1_Gram)

sentence_list =  # the list of 5 sentences



for b in xrange(0,5):

    sentence = '<s> '

    while len(sentence.split()) < n:

            word = find_next_word(uni_gram, sentence.split()[-1])

            sentence += word

    while sentence.split()[-1] != '</s>':

        # list = sentence.split()

        # his= list[-(n-1):]

        # histor = ' '.join(his)

        # print(histor)

        next_word = find_next_word(n_Gram, sentence.split()[-1])

        if next_word != '</s>':

            sentence += next_word

        if next_word == '</s>':

            break

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'



    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list

Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!

asked 4 mins ago

Yuhe Zhu

233

Any suggestion would be really helpful.

def N_Gram(corpus, n):

corpus = ''.join(corpus)

corpus = corpus.split(' ')

output = {}

for i in range(len(corpus)-n+1):

    g = ' '.join(corpus[i:i+n])

    output.setdefault(g,0)

    output[g] += 1

return output



def Uni_Generation():

corpus = ReadFile()

uni = N_Gram(corpus, 1)

print(uni)

final = unsmoothed_totalcount(uni)

print(final)

sentence_list =  # the list of 5 sentences

for b in xrange(0,5):

    sentence = '<s> '

    while sentence.split()[-1] != '</s>': #last word is not </s>

        sentence += return_random_selected_item(final, uni)

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'

        #print(sentence)

    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list



def unsmoothed_totalcount(n_gram_dict):

#get total count of words

keyList = n_gram_dict.keys()

final = 0

for x in n_gram_dict:

    for i,a in enumerate(keyList):

        if a == x:

            prev_word = keyList[i-1]

    prev_count = n_gram_dict[prev_word] # get the previous word count

    final += prev_count

return final



def find_next_word(N_Gram_dic, histo):

#find all bigram start with histo, use the count and do everything

corpus = ReadFile()

n_count_dic = {}

uni_dic= N_Gram(corpus, 1)

if histo.count(' ') == 0: # there is no history for it

    #print("get here")

    n_count_dic = uni_dic

else:

    keyList = N_Gram_dic.keys()

    #print(keyList)

    leng = histo.count(' ') + 1

    #print("histo is ")

    #print(histo)

    for a in keyList:

        #word = ' '.join(a.split()[:leng])

        #print("word is " + word)

        if(histo == a):



    # if histo == ' '.join(a.split()[:leng]):

            n_count_dic[a] = N_Gram_dic[a]

#print(n_count_dic)

final = unsmoothed_totalcount(n_count_dic)

#print("i did my best")

#print(final)

return return_random_selected_item(final, n_count_dic)



def return_random_selected_item(total_count, n_count_dict):

print(total_count)

r = random.randint(1,total_count)

for x in n_count_dict:

    f1 = n_count_dict[x]

    if r - f1 <= 0 : # if the word choosen is not ending token

        return x.split()[-1] + ' '

    if r > f1:

        r = r - f1

and the main part is here

def N_Gram_Generation(n):

corpus = ReadFile()

if n == 1:

    return Uni_Generation()



uni_gram = N_Gram(corpus, 1)

n_Gram = N_Gram(corpus, n)

N_m1_Gram = N_Gram(corpus, n-1)

#final = unsmoothed_totalcount(N_m1_Gram)

sentence_list =  # the list of 5 sentences



for b in xrange(0,5):

    sentence = '<s> '

    while len(sentence.split()) < n:

            word = find_next_word(uni_gram, sentence.split()[-1])

            sentence += word

    while sentence.split()[-1] != '</s>':

        # list = sentence.split()

        # his= list[-(n-1):]

        # histor = ' '.join(his)

        # print(histor)

        next_word = find_next_word(n_Gram, sentence.split()[-1])

        if next_word != '</s>':

            sentence += next_word

        if next_word == '</s>':

            break

        if len(sentence.split()) >= 15 : # if the length of sentence is more than 15

            sentence += '</s>'



    sentence = post_processing(sentence)

    sentence_list.append(sentence)

return sentence_list

Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!

python

asked 4 mins ago

Yuhe Zhu

233

asked 4 mins ago

Yuhe Zhu

233

asked 4 mins ago

Yuhe Zhu

233

asked 4 mins ago

Yuhe Zhu

233

asked 4 mins ago

Yuhe Zhu

233

add a comment |

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212668%2fpython-program-generating-n-gram-language-model%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Tukukkk