Looking for words from a list of words in a sentence
Problem
Given a Dictionary with user_id
and a list of alert_words
(words and phrases too look for in an sentence) and a string content
. We have to look if the alert_words
appears in the content
and return the list of user_ids
who's alert_words
appears in the content
Example
input = { 1 : ['how are you'], 2 : ['you are'] }
content = 'hello, how are you'
output = [1]
user_id
= 1 has 'how are you' while user_id
= 2 has the words but not in the correct order so only user 1 is returned.
Solution
I'm using Google's pygtrie implementation of Trie
data structure to achieve this. [pygtrie documentation]
Algorithm:
- For each word in the given sentence
- Check if the word is a key, if yes add it to the list of user_ids
- check if the word has a subtrie i.e. that current word is the starting of a alert_word. So add it to another set
alert_phrases
- for each word in
alert_phrases
we check if we can extent with the current word and do the same set of operations if it is a key/subtrie
Code
import pygtrie
from typing import Dict, List, Set
def build_trie(realm_alert_words : Dict[int, List[int]]) -> pygtrie.StringTrie:
trie = pygtrie.StringTrie()
for user_id, words in realm_alert_words.items():
for word in words:
alert_word = trie._separator.join(word.split())
if trie.has_key(alert_word):
user_ids_for_word = trie.get(alert_word)
user_ids_for_word.update([user_id])
else:
trie[alert_word] = set([user_id])
return trie
def get_user_ids_with_alert_words(trie : pygtrie.StringTrie, content : str) -> Set[int]:
"""Returns the list of user_id's who have alert_words present in content"""
content_words = content.split()
alert_phrases = set()
user_ids_in_messages = set()
for possible_alert_word in content_words:
#has_node returns 1(HAS_VALUE) if the exact key is found, 2(HAS_SUBTRIE) if the key is a sub trie,
# 3 if it's both 0 if it's none
#https://pygtrie.readthedocs.io/en/latest/#pygtrie.Trie.has_node
alert_word_in_trie = trie.has_node(possible_alert_word)
if alert_word_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(possible_alert_word)
user_ids_in_messages.update(user_ids)
deep_copy_alert_phrases = set(alert_phrases)
# Check if extending the phrases with the current word in content is a subtrie or key. And
# Remove the word if it is not a subtrie as we are interested only in continuos words in the content
for alert_phrase in deep_copy_alert_phrases:
alert_phrases.remove(alert_phrase)
extended_alert_phrase = alert_phrase + trie._separator + possible_alert_word
alert_phrase_in_trie = trie.has_node(extended_alert_phrase)
if alert_phrase_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(extended_alert_phrase)
user_ids_in_messages.update(user_ids)
if alert_phrase_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(extended_alert_phrase)
if alert_word_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(possible_alert_word)
return user_ids_in_messages
Tests
input = {1 : ['hello'], 7 : ['this possible'], 2 : ['hello'], 3 : ['hello'], 5 : ['how are you'], 6 : ['hey']}
alert_word_trie = build_trie(input)
content = 'hello how is this possible how are you doing today'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 3, 5, 7]))
input = {1 : ['provisioning', 'Prod deployment'], 2 : ['test', 'Prod'], 3 : ['prod'], 4 : ['deployment'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Prod deployment has been completed'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 4]))
input = {1 : ['provisioning/log.txt'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Errors logged at provisioning/log.txt '
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1]))
The two methods are part of a larger classes which have some not so related code. You get a list of user_id's and their alert_words from the database and you process every message content
based on the trie already build up.
This is for a chat application so frequency of running the get_user_id_with_alert_words
is high when the build_trie
is relatively less since it will be cached.
python algorithm python-3.x trie
add a comment |
Problem
Given a Dictionary with user_id
and a list of alert_words
(words and phrases too look for in an sentence) and a string content
. We have to look if the alert_words
appears in the content
and return the list of user_ids
who's alert_words
appears in the content
Example
input = { 1 : ['how are you'], 2 : ['you are'] }
content = 'hello, how are you'
output = [1]
user_id
= 1 has 'how are you' while user_id
= 2 has the words but not in the correct order so only user 1 is returned.
Solution
I'm using Google's pygtrie implementation of Trie
data structure to achieve this. [pygtrie documentation]
Algorithm:
- For each word in the given sentence
- Check if the word is a key, if yes add it to the list of user_ids
- check if the word has a subtrie i.e. that current word is the starting of a alert_word. So add it to another set
alert_phrases
- for each word in
alert_phrases
we check if we can extent with the current word and do the same set of operations if it is a key/subtrie
Code
import pygtrie
from typing import Dict, List, Set
def build_trie(realm_alert_words : Dict[int, List[int]]) -> pygtrie.StringTrie:
trie = pygtrie.StringTrie()
for user_id, words in realm_alert_words.items():
for word in words:
alert_word = trie._separator.join(word.split())
if trie.has_key(alert_word):
user_ids_for_word = trie.get(alert_word)
user_ids_for_word.update([user_id])
else:
trie[alert_word] = set([user_id])
return trie
def get_user_ids_with_alert_words(trie : pygtrie.StringTrie, content : str) -> Set[int]:
"""Returns the list of user_id's who have alert_words present in content"""
content_words = content.split()
alert_phrases = set()
user_ids_in_messages = set()
for possible_alert_word in content_words:
#has_node returns 1(HAS_VALUE) if the exact key is found, 2(HAS_SUBTRIE) if the key is a sub trie,
# 3 if it's both 0 if it's none
#https://pygtrie.readthedocs.io/en/latest/#pygtrie.Trie.has_node
alert_word_in_trie = trie.has_node(possible_alert_word)
if alert_word_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(possible_alert_word)
user_ids_in_messages.update(user_ids)
deep_copy_alert_phrases = set(alert_phrases)
# Check if extending the phrases with the current word in content is a subtrie or key. And
# Remove the word if it is not a subtrie as we are interested only in continuos words in the content
for alert_phrase in deep_copy_alert_phrases:
alert_phrases.remove(alert_phrase)
extended_alert_phrase = alert_phrase + trie._separator + possible_alert_word
alert_phrase_in_trie = trie.has_node(extended_alert_phrase)
if alert_phrase_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(extended_alert_phrase)
user_ids_in_messages.update(user_ids)
if alert_phrase_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(extended_alert_phrase)
if alert_word_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(possible_alert_word)
return user_ids_in_messages
Tests
input = {1 : ['hello'], 7 : ['this possible'], 2 : ['hello'], 3 : ['hello'], 5 : ['how are you'], 6 : ['hey']}
alert_word_trie = build_trie(input)
content = 'hello how is this possible how are you doing today'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 3, 5, 7]))
input = {1 : ['provisioning', 'Prod deployment'], 2 : ['test', 'Prod'], 3 : ['prod'], 4 : ['deployment'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Prod deployment has been completed'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 4]))
input = {1 : ['provisioning/log.txt'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Errors logged at provisioning/log.txt '
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1]))
The two methods are part of a larger classes which have some not so related code. You get a list of user_id's and their alert_words from the database and you process every message content
based on the trie already build up.
This is for a chat application so frequency of running the get_user_id_with_alert_words
is high when the build_trie
is relatively less since it will be cached.
python algorithm python-3.x trie
add a comment |
Problem
Given a Dictionary with user_id
and a list of alert_words
(words and phrases too look for in an sentence) and a string content
. We have to look if the alert_words
appears in the content
and return the list of user_ids
who's alert_words
appears in the content
Example
input = { 1 : ['how are you'], 2 : ['you are'] }
content = 'hello, how are you'
output = [1]
user_id
= 1 has 'how are you' while user_id
= 2 has the words but not in the correct order so only user 1 is returned.
Solution
I'm using Google's pygtrie implementation of Trie
data structure to achieve this. [pygtrie documentation]
Algorithm:
- For each word in the given sentence
- Check if the word is a key, if yes add it to the list of user_ids
- check if the word has a subtrie i.e. that current word is the starting of a alert_word. So add it to another set
alert_phrases
- for each word in
alert_phrases
we check if we can extent with the current word and do the same set of operations if it is a key/subtrie
Code
import pygtrie
from typing import Dict, List, Set
def build_trie(realm_alert_words : Dict[int, List[int]]) -> pygtrie.StringTrie:
trie = pygtrie.StringTrie()
for user_id, words in realm_alert_words.items():
for word in words:
alert_word = trie._separator.join(word.split())
if trie.has_key(alert_word):
user_ids_for_word = trie.get(alert_word)
user_ids_for_word.update([user_id])
else:
trie[alert_word] = set([user_id])
return trie
def get_user_ids_with_alert_words(trie : pygtrie.StringTrie, content : str) -> Set[int]:
"""Returns the list of user_id's who have alert_words present in content"""
content_words = content.split()
alert_phrases = set()
user_ids_in_messages = set()
for possible_alert_word in content_words:
#has_node returns 1(HAS_VALUE) if the exact key is found, 2(HAS_SUBTRIE) if the key is a sub trie,
# 3 if it's both 0 if it's none
#https://pygtrie.readthedocs.io/en/latest/#pygtrie.Trie.has_node
alert_word_in_trie = trie.has_node(possible_alert_word)
if alert_word_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(possible_alert_word)
user_ids_in_messages.update(user_ids)
deep_copy_alert_phrases = set(alert_phrases)
# Check if extending the phrases with the current word in content is a subtrie or key. And
# Remove the word if it is not a subtrie as we are interested only in continuos words in the content
for alert_phrase in deep_copy_alert_phrases:
alert_phrases.remove(alert_phrase)
extended_alert_phrase = alert_phrase + trie._separator + possible_alert_word
alert_phrase_in_trie = trie.has_node(extended_alert_phrase)
if alert_phrase_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(extended_alert_phrase)
user_ids_in_messages.update(user_ids)
if alert_phrase_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(extended_alert_phrase)
if alert_word_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(possible_alert_word)
return user_ids_in_messages
Tests
input = {1 : ['hello'], 7 : ['this possible'], 2 : ['hello'], 3 : ['hello'], 5 : ['how are you'], 6 : ['hey']}
alert_word_trie = build_trie(input)
content = 'hello how is this possible how are you doing today'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 3, 5, 7]))
input = {1 : ['provisioning', 'Prod deployment'], 2 : ['test', 'Prod'], 3 : ['prod'], 4 : ['deployment'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Prod deployment has been completed'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 4]))
input = {1 : ['provisioning/log.txt'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Errors logged at provisioning/log.txt '
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1]))
The two methods are part of a larger classes which have some not so related code. You get a list of user_id's and their alert_words from the database and you process every message content
based on the trie already build up.
This is for a chat application so frequency of running the get_user_id_with_alert_words
is high when the build_trie
is relatively less since it will be cached.
python algorithm python-3.x trie
Problem
Given a Dictionary with user_id
and a list of alert_words
(words and phrases too look for in an sentence) and a string content
. We have to look if the alert_words
appears in the content
and return the list of user_ids
who's alert_words
appears in the content
Example
input = { 1 : ['how are you'], 2 : ['you are'] }
content = 'hello, how are you'
output = [1]
user_id
= 1 has 'how are you' while user_id
= 2 has the words but not in the correct order so only user 1 is returned.
Solution
I'm using Google's pygtrie implementation of Trie
data structure to achieve this. [pygtrie documentation]
Algorithm:
- For each word in the given sentence
- Check if the word is a key, if yes add it to the list of user_ids
- check if the word has a subtrie i.e. that current word is the starting of a alert_word. So add it to another set
alert_phrases
- for each word in
alert_phrases
we check if we can extent with the current word and do the same set of operations if it is a key/subtrie
Code
import pygtrie
from typing import Dict, List, Set
def build_trie(realm_alert_words : Dict[int, List[int]]) -> pygtrie.StringTrie:
trie = pygtrie.StringTrie()
for user_id, words in realm_alert_words.items():
for word in words:
alert_word = trie._separator.join(word.split())
if trie.has_key(alert_word):
user_ids_for_word = trie.get(alert_word)
user_ids_for_word.update([user_id])
else:
trie[alert_word] = set([user_id])
return trie
def get_user_ids_with_alert_words(trie : pygtrie.StringTrie, content : str) -> Set[int]:
"""Returns the list of user_id's who have alert_words present in content"""
content_words = content.split()
alert_phrases = set()
user_ids_in_messages = set()
for possible_alert_word in content_words:
#has_node returns 1(HAS_VALUE) if the exact key is found, 2(HAS_SUBTRIE) if the key is a sub trie,
# 3 if it's both 0 if it's none
#https://pygtrie.readthedocs.io/en/latest/#pygtrie.Trie.has_node
alert_word_in_trie = trie.has_node(possible_alert_word)
if alert_word_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(possible_alert_word)
user_ids_in_messages.update(user_ids)
deep_copy_alert_phrases = set(alert_phrases)
# Check if extending the phrases with the current word in content is a subtrie or key. And
# Remove the word if it is not a subtrie as we are interested only in continuos words in the content
for alert_phrase in deep_copy_alert_phrases:
alert_phrases.remove(alert_phrase)
extended_alert_phrase = alert_phrase + trie._separator + possible_alert_word
alert_phrase_in_trie = trie.has_node(extended_alert_phrase)
if alert_phrase_in_trie & pygtrie.Trie.HAS_VALUE:
user_ids = trie.get(extended_alert_phrase)
user_ids_in_messages.update(user_ids)
if alert_phrase_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(extended_alert_phrase)
if alert_word_in_trie & pygtrie.Trie.HAS_SUBTRIE:
alert_phrases.add(possible_alert_word)
return user_ids_in_messages
Tests
input = {1 : ['hello'], 7 : ['this possible'], 2 : ['hello'], 3 : ['hello'], 5 : ['how are you'], 6 : ['hey']}
alert_word_trie = build_trie(input)
content = 'hello how is this possible how are you doing today'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 3, 5, 7]))
input = {1 : ['provisioning', 'Prod deployment'], 2 : ['test', 'Prod'], 3 : ['prod'], 4 : ['deployment'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Prod deployment has been completed'
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1, 2, 4]))
input = {1 : ['provisioning/log.txt'] }
alert_word_trie = build_trie(input)
content = 'Hello, everyone. Errors logged at provisioning/log.txt '
result = get_user_ids_with_alert_words(alert_word_trie, content)
assert(result == set([1]))
The two methods are part of a larger classes which have some not so related code. You get a list of user_id's and their alert_words from the database and you process every message content
based on the trie already build up.
This is for a chat application so frequency of running the get_user_id_with_alert_words
is high when the build_trie
is relatively less since it will be cached.
python algorithm python-3.x trie
python algorithm python-3.x trie
asked 20 mins ago
thebenmanthebenman
501315
501315
add a comment |
add a comment |
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211370%2flooking-for-words-from-a-list-of-words-in-a-sentence%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211370%2flooking-for-words-from-a-list-of-words-in-a-sentence%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown