Python Program Generating N-Gram Language Model
$begingroup$
I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.
Any suggestion would be really helpful.
def N_Gram(corpus, n):
corpus = ''.join(corpus)
corpus = corpus.split(' ')
output = {}
for i in range(len(corpus)-n+1):
g = ' '.join(corpus[i:i+n])
output.setdefault(g,0)
output[g] += 1
return output
def Uni_Generation():
corpus = ReadFile()
uni = N_Gram(corpus, 1)
print(uni)
final = unsmoothed_totalcount(uni)
print(final)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while sentence.split()[-1] != '</s>': #last word is not </s>
sentence += return_random_selected_item(final, uni)
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
#print(sentence)
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
def unsmoothed_totalcount(n_gram_dict):
#get total count of words
keyList = n_gram_dict.keys()
final = 0
for x in n_gram_dict:
for i,a in enumerate(keyList):
if a == x:
prev_word = keyList[i-1]
prev_count = n_gram_dict[prev_word] # get the previous word count
final += prev_count
return final
def find_next_word(N_Gram_dic, histo):
#find all bigram start with histo, use the count and do everything
corpus = ReadFile()
n_count_dic = {}
uni_dic= N_Gram(corpus, 1)
if histo.count(' ') == 0: # there is no history for it
#print("get here")
n_count_dic = uni_dic
else:
keyList = N_Gram_dic.keys()
#print(keyList)
leng = histo.count(' ') + 1
#print("histo is ")
#print(histo)
for a in keyList:
#word = ' '.join(a.split()[:leng])
#print("word is " + word)
if(histo == a):
# if histo == ' '.join(a.split()[:leng]):
n_count_dic[a] = N_Gram_dic[a]
#print(n_count_dic)
final = unsmoothed_totalcount(n_count_dic)
#print("i did my best")
#print(final)
return return_random_selected_item(final, n_count_dic)
def return_random_selected_item(total_count, n_count_dict):
print(total_count)
r = random.randint(1,total_count)
for x in n_count_dict:
f1 = n_count_dict[x]
if r - f1 <= 0 : # if the word choosen is not ending token
return x.split()[-1] + ' '
if r > f1:
r = r - f1
and the main part is here
def N_Gram_Generation(n):
corpus = ReadFile()
if n == 1:
return Uni_Generation()
uni_gram = N_Gram(corpus, 1)
n_Gram = N_Gram(corpus, n)
N_m1_Gram = N_Gram(corpus, n-1)
#final = unsmoothed_totalcount(N_m1_Gram)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while len(sentence.split()) < n:
word = find_next_word(uni_gram, sentence.split()[-1])
sentence += word
while sentence.split()[-1] != '</s>':
# list = sentence.split()
# his= list[-(n-1):]
# histor = ' '.join(his)
# print(histor)
next_word = find_next_word(n_Gram, sentence.split()[-1])
if next_word != '</s>':
sentence += next_word
if next_word == '</s>':
break
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!
python
$endgroup$
add a comment |
$begingroup$
I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.
Any suggestion would be really helpful.
def N_Gram(corpus, n):
corpus = ''.join(corpus)
corpus = corpus.split(' ')
output = {}
for i in range(len(corpus)-n+1):
g = ' '.join(corpus[i:i+n])
output.setdefault(g,0)
output[g] += 1
return output
def Uni_Generation():
corpus = ReadFile()
uni = N_Gram(corpus, 1)
print(uni)
final = unsmoothed_totalcount(uni)
print(final)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while sentence.split()[-1] != '</s>': #last word is not </s>
sentence += return_random_selected_item(final, uni)
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
#print(sentence)
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
def unsmoothed_totalcount(n_gram_dict):
#get total count of words
keyList = n_gram_dict.keys()
final = 0
for x in n_gram_dict:
for i,a in enumerate(keyList):
if a == x:
prev_word = keyList[i-1]
prev_count = n_gram_dict[prev_word] # get the previous word count
final += prev_count
return final
def find_next_word(N_Gram_dic, histo):
#find all bigram start with histo, use the count and do everything
corpus = ReadFile()
n_count_dic = {}
uni_dic= N_Gram(corpus, 1)
if histo.count(' ') == 0: # there is no history for it
#print("get here")
n_count_dic = uni_dic
else:
keyList = N_Gram_dic.keys()
#print(keyList)
leng = histo.count(' ') + 1
#print("histo is ")
#print(histo)
for a in keyList:
#word = ' '.join(a.split()[:leng])
#print("word is " + word)
if(histo == a):
# if histo == ' '.join(a.split()[:leng]):
n_count_dic[a] = N_Gram_dic[a]
#print(n_count_dic)
final = unsmoothed_totalcount(n_count_dic)
#print("i did my best")
#print(final)
return return_random_selected_item(final, n_count_dic)
def return_random_selected_item(total_count, n_count_dict):
print(total_count)
r = random.randint(1,total_count)
for x in n_count_dict:
f1 = n_count_dict[x]
if r - f1 <= 0 : # if the word choosen is not ending token
return x.split()[-1] + ' '
if r > f1:
r = r - f1
and the main part is here
def N_Gram_Generation(n):
corpus = ReadFile()
if n == 1:
return Uni_Generation()
uni_gram = N_Gram(corpus, 1)
n_Gram = N_Gram(corpus, n)
N_m1_Gram = N_Gram(corpus, n-1)
#final = unsmoothed_totalcount(N_m1_Gram)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while len(sentence.split()) < n:
word = find_next_word(uni_gram, sentence.split()[-1])
sentence += word
while sentence.split()[-1] != '</s>':
# list = sentence.split()
# his= list[-(n-1):]
# histor = ' '.join(his)
# print(histor)
next_word = find_next_word(n_Gram, sentence.split()[-1])
if next_word != '</s>':
sentence += next_word
if next_word == '</s>':
break
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!
python
$endgroup$
add a comment |
$begingroup$
I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.
Any suggestion would be really helpful.
def N_Gram(corpus, n):
corpus = ''.join(corpus)
corpus = corpus.split(' ')
output = {}
for i in range(len(corpus)-n+1):
g = ' '.join(corpus[i:i+n])
output.setdefault(g,0)
output[g] += 1
return output
def Uni_Generation():
corpus = ReadFile()
uni = N_Gram(corpus, 1)
print(uni)
final = unsmoothed_totalcount(uni)
print(final)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while sentence.split()[-1] != '</s>': #last word is not </s>
sentence += return_random_selected_item(final, uni)
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
#print(sentence)
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
def unsmoothed_totalcount(n_gram_dict):
#get total count of words
keyList = n_gram_dict.keys()
final = 0
for x in n_gram_dict:
for i,a in enumerate(keyList):
if a == x:
prev_word = keyList[i-1]
prev_count = n_gram_dict[prev_word] # get the previous word count
final += prev_count
return final
def find_next_word(N_Gram_dic, histo):
#find all bigram start with histo, use the count and do everything
corpus = ReadFile()
n_count_dic = {}
uni_dic= N_Gram(corpus, 1)
if histo.count(' ') == 0: # there is no history for it
#print("get here")
n_count_dic = uni_dic
else:
keyList = N_Gram_dic.keys()
#print(keyList)
leng = histo.count(' ') + 1
#print("histo is ")
#print(histo)
for a in keyList:
#word = ' '.join(a.split()[:leng])
#print("word is " + word)
if(histo == a):
# if histo == ' '.join(a.split()[:leng]):
n_count_dic[a] = N_Gram_dic[a]
#print(n_count_dic)
final = unsmoothed_totalcount(n_count_dic)
#print("i did my best")
#print(final)
return return_random_selected_item(final, n_count_dic)
def return_random_selected_item(total_count, n_count_dict):
print(total_count)
r = random.randint(1,total_count)
for x in n_count_dict:
f1 = n_count_dict[x]
if r - f1 <= 0 : # if the word choosen is not ending token
return x.split()[-1] + ' '
if r > f1:
r = r - f1
and the main part is here
def N_Gram_Generation(n):
corpus = ReadFile()
if n == 1:
return Uni_Generation()
uni_gram = N_Gram(corpus, 1)
n_Gram = N_Gram(corpus, n)
N_m1_Gram = N_Gram(corpus, n-1)
#final = unsmoothed_totalcount(N_m1_Gram)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while len(sentence.split()) < n:
word = find_next_word(uni_gram, sentence.split()[-1])
sentence += word
while sentence.split()[-1] != '</s>':
# list = sentence.split()
# his= list[-(n-1):]
# histor = ' '.join(his)
# print(histor)
next_word = find_next_word(n_Gram, sentence.split()[-1])
if next_word != '</s>':
sentence += next_word
if next_word == '</s>':
break
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!
python
$endgroup$
I am pretty new to python, and I am writing this program to randomly generate sentences based on the n-gram language. It takes me very long to run this with the large input file I have, so it is very hard for me to check my work. I guess my problem is that, when I need 2 words as the history and based on the count of words appear after the 2 words, I generate the next word. And it takes very long and hard for me to do that for some reason.
Any suggestion would be really helpful.
def N_Gram(corpus, n):
corpus = ''.join(corpus)
corpus = corpus.split(' ')
output = {}
for i in range(len(corpus)-n+1):
g = ' '.join(corpus[i:i+n])
output.setdefault(g,0)
output[g] += 1
return output
def Uni_Generation():
corpus = ReadFile()
uni = N_Gram(corpus, 1)
print(uni)
final = unsmoothed_totalcount(uni)
print(final)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while sentence.split()[-1] != '</s>': #last word is not </s>
sentence += return_random_selected_item(final, uni)
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
#print(sentence)
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
def unsmoothed_totalcount(n_gram_dict):
#get total count of words
keyList = n_gram_dict.keys()
final = 0
for x in n_gram_dict:
for i,a in enumerate(keyList):
if a == x:
prev_word = keyList[i-1]
prev_count = n_gram_dict[prev_word] # get the previous word count
final += prev_count
return final
def find_next_word(N_Gram_dic, histo):
#find all bigram start with histo, use the count and do everything
corpus = ReadFile()
n_count_dic = {}
uni_dic= N_Gram(corpus, 1)
if histo.count(' ') == 0: # there is no history for it
#print("get here")
n_count_dic = uni_dic
else:
keyList = N_Gram_dic.keys()
#print(keyList)
leng = histo.count(' ') + 1
#print("histo is ")
#print(histo)
for a in keyList:
#word = ' '.join(a.split()[:leng])
#print("word is " + word)
if(histo == a):
# if histo == ' '.join(a.split()[:leng]):
n_count_dic[a] = N_Gram_dic[a]
#print(n_count_dic)
final = unsmoothed_totalcount(n_count_dic)
#print("i did my best")
#print(final)
return return_random_selected_item(final, n_count_dic)
def return_random_selected_item(total_count, n_count_dict):
print(total_count)
r = random.randint(1,total_count)
for x in n_count_dict:
f1 = n_count_dict[x]
if r - f1 <= 0 : # if the word choosen is not ending token
return x.split()[-1] + ' '
if r > f1:
r = r - f1
and the main part is here
def N_Gram_Generation(n):
corpus = ReadFile()
if n == 1:
return Uni_Generation()
uni_gram = N_Gram(corpus, 1)
n_Gram = N_Gram(corpus, n)
N_m1_Gram = N_Gram(corpus, n-1)
#final = unsmoothed_totalcount(N_m1_Gram)
sentence_list = # the list of 5 sentences
for b in xrange(0,5):
sentence = '<s> '
while len(sentence.split()) < n:
word = find_next_word(uni_gram, sentence.split()[-1])
sentence += word
while sentence.split()[-1] != '</s>':
# list = sentence.split()
# his= list[-(n-1):]
# histor = ' '.join(his)
# print(histor)
next_word = find_next_word(n_Gram, sentence.split()[-1])
if next_word != '</s>':
sentence += next_word
if next_word == '</s>':
break
if len(sentence.split()) >= 15 : # if the length of sentence is more than 15
sentence += '</s>'
sentence = post_processing(sentence)
sentence_list.append(sentence)
return sentence_list
Sorry if I was not clear or anything. I am really lack of sleep right now for this assignment. The readfile and postprocessing part is all good so I will not put that here.
Thanks!
python
python
asked 4 mins ago
Yuhe ZhuYuhe Zhu
233
233
add a comment |
add a comment |
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212668%2fpython-program-generating-n-gram-language-model%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f212668%2fpython-program-generating-n-gram-language-model%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown