Commit 67445cd

first decision tree try, new sentiment analysis plot

1 parent e0930df
File tree: 4 files changed (+194 additions, −3 deletions)


analysis/decisionTree.py

Lines changed: 106 additions & 0 deletions

@@ -0,0 +1,106 @@
from pathlib import Path
import sys
import string

import sklearn.linear_model
import sklearn.tree
import sklearn.dummy
from sklearn.model_selection import train_test_split
from nltk import FreqDist
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
from tqdm import tqdm
import graphviz
from analysis import loadMetadata, concatTables


STOPWORDS = stopwords.words() + list(string.punctuation) + ["'s", "--", 'applause', "’re", "--ms", "--the"]


def getWordFrequencySpeaker(amount: int = 10) -> pd.DataFrame:
    metadataDF: pd.DataFrame = loadMetadata()
    totalDF = pd.DataFrame()

    root = Path('corpus') / 'tables'

    for index, row in tqdm(metadataDF.iterrows(), ncols=80, total=len(metadataDF)):
        tablePath = root / row['linkTables']

        tableDF = pd.read_csv(tablePath)

        fdist = FreqDist(cleanTokens(tableDF))
        mostCommon = fdist.most_common(amount)

        df = pd.DataFrame(data=dict(mostCommon) | {'PERIOD': row['period'], 'SPEAKER': row['speaker']}, index=[index])

        totalDF = totalDF.combine_first(df)

    totalDF.fillna(0, inplace=True)
    totalDF = totalDF.convert_dtypes()

    return totalDF


def cleanTokens(tableDF: pd.DataFrame) -> list:
    return [w.lower() for w in tableDF['LEMMA'] if w not in STOPWORDS and w.isalpha()]


def renderDecisionTree(tree, featureNames) -> None:
    # Render the fitted tree via Graphviz. Feature names are passed in
    # explicitly; class names are taken from the classifier itself so the
    # labels match the order of tree.classes_.
    sklearn.tree.export_graphviz(tree, out_file='decisiontree.dot',
                                 feature_names=featureNames,
                                 class_names=list(tree.classes_),
                                 filled=True, rounded=True,
                                 special_characters=True)

    with open('decisiontree.dot') as f:
        dot_graph = f.read()

    graph = graphviz.Source(dot_graph)
    graph.render('decisionTree', format='png', cleanup=True)

    return None


def trainTreeClassifier(df: pd.DataFrame) -> sklearn.tree.DecisionTreeClassifier:
    # Minimal training routine mirroring the __main__ block below: expects the
    # word-frequency table with a SPEAKER column as the target label.
    features = df.drop(columns=['SPEAKER', 'PERIOD'], errors='ignore')
    target = df['SPEAKER']

    treeClf = sklearn.tree.DecisionTreeClassifier(max_depth=9)
    treeClf.fit(features, target)

    return treeClf


if __name__ == '__main__':
    # totalDF = getWordFrequencySpeaker()
    # totalDF.to_parquet('wordFreq.parquet', index=False)

    df = pd.read_parquet('wordFreq.parquet')
    df = df.convert_dtypes()
    df.pop('PERIOD')

    y = df.pop('SPEAKER')
    train, test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

    treeClf = sklearn.tree.DecisionTreeClassifier(max_depth=9)

    treeClf.fit(train, y_train)

    accuracy = treeClf.score(test, y_test)
    print(f'Accuracy: {accuracy:.2f}')

    for strategy in ['most_frequent', 'prior', 'stratified', 'uniform']:
        dummyClf = sklearn.dummy.DummyClassifier(strategy=strategy)
        dummyClf.fit(train, y_train)
        accuracy = dummyClf.score(test, y_test)
        print(f'Accuracy Dummy: {accuracy:.2f}')

    # renderDecisionTree(treeClf, df.columns)
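
For orientation, a minimal usage sketch of the helpers above (not part of the commit; it assumes the module is importable as analysis.decisionTree, that the corpus tables used by loadMetadata() are in place, and that the NLTK stopword lists are available):

    # Hypothetical driver script; function names come from the file above.
    from analysis.decisionTree import (getWordFrequencySpeaker,
                                       trainTreeClassifier, renderDecisionTree)

    freqDF = getWordFrequencySpeaker(amount=10)   # top-10 lemma counts per speech plus SPEAKER/PERIOD labels
    clf = trainTreeClassifier(freqDF)             # depth-9 decision tree fitted on the word counts
    renderDecisionTree(clf, freqDF.drop(columns=['SPEAKER', 'PERIOD']).columns)  # writes decisionTree.png
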
Binary file (131 KB); image preview not rendered.

analysis/sentiments.py

Lines changed: 87 additions & 3 deletions
@@ -293,7 +293,6 @@ def sentimentViolinPeriod(show: bool = True) -> None:
     return None
 
 
-
 def sentimentTTestPeriod(show: bool = True) -> None:
     metadataDF: pd.DataFrame = loadMetadata()
 
@@ -331,6 +330,86 @@ def sentimentTTestPeriod(show: bool = True) -> None:
     return ttestResult
 
 
+def sentimentPerText(show: bool = True) -> None:
+    metadataDF: pd.DataFrame = loadMetadata()
+
+    root = Path('corpus') / 'tables'
+    n_quantiles = 5
+    resultDF = pd.DataFrame(columns=list(range(n_quantiles)) + ['speaker'])
+
+
+    layout = {'side': ['negative', 'positive'],
+              'color': ['blue', 'red']}
+    yaxisRange = (-0.5, 1)
+
+    if False:  # precomputation disabled; the aggregated results are read from test2.csv below
+        for i, row in tqdm(metadataDF.iterrows(), ncols=80, total=len(metadataDF)):
+            tablePath = root / row['linkTables']
+
+            df = pd.read_csv(tablePath)
+
+
+            df = df[['SENTENCE_ID', 'SENTIMENT_SENTENCE']].groupby('SENTENCE_ID').agg('mean')
+
+            # Add quantile groups based on the sentence index
+            df['quantile'] = pd.qcut(df.index, q=n_quantiles, labels=False)
+
+            # Calculate the mean sentiment for each quantile
+            quantiles = df.groupby('quantile')['SENTIMENT_SENTENCE'].mean()
+
+            # Turn the quantile means into a row and add the 'speaker' column
+            quantiles_row = quantiles.T
+            quantiles_row['speaker'] = row['speaker']
+
+            # Append the new row to the result DataFrame
+            resultDF.loc[len(resultDF)] = quantiles_row
+
+
+
+    # print(resultDF)
+
+    # resultDF = resultDF.groupby('speaker').mean().T
+    # resultDF.rename(index={'speaker': 'Quantiles'}, inplace=True)
+    # resultDF.reset_index(inplace=True)
+    # resultDF.to_csv('test2.csv')
+
+
+    resultDF = pd.read_csv('test2.csv')
+    print(resultDF)
+    print(resultDF.index, resultDF.columns)
+
+    fig = go.Figure()
+
+    # Add a line trace for each column in resultDF except 'Quantiles'
+    for column in resultDF.columns[1:]:
+        fig.add_trace(go.Scatter(
+            x=resultDF['Quantiles'] + 1,
+            y=resultDF[column],
+            mode='lines',
+            name=column
+        ))
+
+    # Update layout
+    fig.update_layout(
+        title='Mean Sentiment per Speaker in Quantiles',
+        xaxis_title='Quantiles',
+        yaxis_title='Mean Sentiment',
+        legend=dict(
+            orientation='h',   # horizontal legend
+            x=0.5,             # centered horizontally
+            y=1.05,            # placed just above the plot area
+            xanchor='center',  # anchor the legend horizontally at its center
+            yanchor='top'      # anchor the legend vertically at its top edge
+        )
+    )
+
+    fig.show(height=800, width=1200)
+    fig.write_image(Path('analysis/sentimentAnalysis/sentimentAnalysisQuantiles.png'),
+                    height=800, width=1200)
+
+    return
+
+
 def formatStatistics(ttest:scipy.stats._stats_py.TtestResult, cohensD_:float) -> str:
     pValue = ttest.pvalue
 
@@ -367,6 +446,11 @@ def cohensD(series1:pd.Series, series2:pd.Series) -> float:
 
 # sentimentBoxplotSpeaker()
 # sentimentBoxplotYear()
-sentimentViolinPeriod(show=False)
+# sentimentViolinPeriod(show=False)
+sentimentPerText(show=True)
+
+# ttestResult = sentimentTTestPeriod(show=True)
+
+
+
 
-# ttestResult = sentimentTTestPeriod(show=True)
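
The new sentimentPerText() splits each speech's sentences into five position-based bins with pd.qcut and averages sentiment per bin. A small self-contained sketch of that bucketing pattern, using made-up scores rather than corpus data:

    import pandas as pd

    # Toy per-sentence sentiment scores for one speech, indexed by sentence id.
    sent = pd.DataFrame({'SENTIMENT_SENTENCE': [0.2, 0.1, -0.3, 0.0, 0.4,
                                                0.5, -0.1, 0.3, 0.6, 0.2]})

    # Bin the sentence positions into 5 equal-sized groups (0..4), as
    # sentimentPerText does with df.index, then average sentiment per group.
    sent['quantile'] = pd.qcut(sent.index, q=5, labels=False)
    profile = sent.groupby('quantile')['SENTIMENT_SENTENCE'].mean()

    print(profile)   # one mean sentiment value per fifth of the speech
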

analysis/wordFrequency.py

Lines changed: 1 addition & 0 deletions

@@ -155,6 +155,7 @@ def plotWordFrequencySpeaker(amount: int = 10, show: bool = False) -> None:
         posTbl = tokenTbl[tokenTbl['POS'].isin(tags)]
         fdist = FreqDist(posTbl['LEMMA'].str.lower())
         mostCommon = fdist.most_common(amount)
+
 
         fig.add_trace(go.Bar(
             x=[w[0] for w in mostCommon],
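
For reference, nltk's FreqDist counts token occurrences and most_common(n) returns (token, count) pairs in descending order of frequency; a tiny illustration with made-up lemmas:

    from nltk import FreqDist

    lemmas = ['economy', 'people', 'economy', 'nation', 'people', 'economy']
    fdist = FreqDist(lemmas)
    print(fdist.most_common(2))   # [('economy', 3), ('people', 2)]
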
