@@ -52,6 +52,19 @@ def prepare_feature_extraction():
5252
5353 print ("All files for extracting word and paragraph embeddings are present." )
5454
def prepare_word_embeddings(path='../sherlock/features/glove.6B.50d.txt') -> dict:
    """Load a whitespace-delimited word-vector text file into a dictionary.

    Each non-empty line of the file is expected to hold a term followed by
    its vector components, all separated by single spaces.

    Parameters
    ----------
    path
        Location of the vectors file. Defaults to the path the function
        has always read from, so existing zero-argument calls are unchanged.

    Returns
    -------
    dict
        Mapping from each term to a 1-D ``numpy`` float array built from the
        remaining fields of its line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-empty line has no vector part or a component is not numeric.
    """
    word_to_embedding = {}

    # The context manager guarantees the handle is closed even when a
    # malformed line raises part-way through the file.
    with open(path, encoding='utf-8') as word_vectors_f:
        for raw_line in word_vectors_f:
            line = raw_line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no term; skipping it
                # avoids an unpacking error on the split below.
                continue

            # Split once: everything after the first space is the vector.
            term, vector = line.split(' ', 1)
            word_to_embedding[term] = np.array(vector.split(' '), dtype=float)

    return word_to_embedding
67+
5568
5669def convert_string_lists_to_lists (
5770 data : Union [pd .DataFrame , pd .Series ],
@@ -116,6 +129,8 @@ def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
116129 """
117130 prepare_feature_extraction ()
118131
132+ word_to_embedding = prepare_word_embeddings ()
133+
119134 features_list = []
120135 df_par = pd .DataFrame ()
121136 n_samples = 1000
@@ -137,7 +152,7 @@ def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
137152
138153 f = OrderedDict (
139154 list (extract_bag_of_characters_features (raw_sample ).items ()) +
140- list (extract_word_embeddings_features (raw_sample ).items ()) +
155+ list (extract_word_embeddings_features (raw_sample , word_to_embedding ).items ()) +
141156 list (extract_bag_of_words_features (raw_sample , n_values ).items ())
142157 )
143158 features_list .append (f )
0 commit comments