Improved jupyter support for schema

holgerbrandl · holgerbrandl · commit c7ad05cf4a31 · 2021-07-17T12:10:17.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -95,4 +95,7 @@ ExcelWriteResult.xlsx
 
 # sonatype credentials
 local.properties
-*.gpg
+*.gpg
+
+# jupyter
+.ipynb_checkpoints
diff --git a/docs/devel.md b/docs/devel.md
@@ -147,3 +147,30 @@ val df = Dataframe(df.structure().target.selectWhere(column("Column Type").isEqu
 ```
 
 
+# Jupyter Integration
+
+dev scratchpad
+
+```bash
+export KRANGL_HOME=/d/projects/misc/krangl/
+cd $KRANGL_HOME
+
+# start kernel
+cmd.exe "/K" C:\Users\brandl\Anaconda3\Scripts\activate.bat C:\Users\brandl\Anaconda3
+
+# no longer needed because no part of ipynb preamble
+#rm -rf ~/.ivy2/cache/com.systema/
+#rm -rf ~/.ivy2/cache/org.kalasim/
+#rm -rf ~/.ivy2/cache/com.github.holgerbrandl/kravis/
+
+#conda install -c jetbrains kotlin-jupyter-kernel
+# interactive use
+jupyter notebook --kernel=kotlin examples/jupyter/letsplot_example.ipynb
+```
+
+References
+
+* https://github.com/Kotlin/kotlin-jupyter/blob/master/docs/libraries.md
+* Many `letsplot` examples https://nbviewer.jupyter.org/github/JetBrains/lets-plot-kotlin/blob/master/docs/guide/user_guide.ipynb
+
+
diff --git a/examples/jupyter/letsplot_example.ipynb b/examples/jupyter/letsplot_example.ipynb
diff --git a/src/main/kotlin/krangl/Extensions.kt b/src/main/kotlin/krangl/Extensions.kt
@@ -435,6 +435,8 @@ fun main(args: Array<String>) {
     //            "median" to { it["Species"].mean() }
     //        )
 
+    irisData.print()
+    irisData.schema()
     irisData.select { startsWith("Length") }.head().print()
     irisData.summarizeAt({ startsWith("Length") }) {
         add({ mean() }, "mean")
@@ -612,10 +614,10 @@ fun DataFrame.asString(
     // do the actual printing
     val sb = StringBuilder()
 
-    sb.appendln("${title}: ${nrow} x ${ncol}")
+    sb.appendLine("${title}: ${nrow} x ${ncol}")
 
     if (this is GroupedDataFrame) {
-        sb.appendln("Groups: ${by.joinToString()} [${groups.size}]")
+        sb.appendLine("Groups: ${by.joinToString()} [${groups.size}]")
     }
 
 
@@ -632,15 +634,15 @@ fun DataFrame.asString(
     if (colNames) widthTrimmed.cols.mapIndexed { index, col ->
         col.name.padStart(padding[index])
     }.joinToString("").apply {
-        sb.appendln(this)
+        sb.appendLine(this)
     }
 
 
     widthTrimmed.rows.map { it.values }.map { rowData ->
         // show null as NA when printing data
         rowData.mapIndexed { index, value ->
             valuePrinter(value).padStart(padding[index])
-        }.joinToString("").apply { sb.appendln(this) }
+        }.joinToString("").apply { sb.appendLine(this) }
     }
 
     // similar to dplyr render a summary below the table
@@ -687,34 +689,106 @@ fun List<ColSpec>.asDf() = deparseRecords { mapOf("index" to it.pos, "name" to i
 
 fun List<ColSpec>.print() = asDf().print()
 
+internal val IS_JUPYTER by lazy {
+    try {
+        // check if we are in a notebook by probing the classpath (evaluated once, on first access, because of `by lazy`); presumably this class is only present when running under the kotlin jupyter kernel - TODO confirm
+        Class.forName("jupyter.kotlin.KotlinContext")
+        true
+    } catch (e: ClassNotFoundException) {
+        // class lookup failed, so it's not jupyter
+        false
+    }
+}
 
-// see https://spark.apache.org/docs/latest/sql-programming-guide.html#untyped-dataset-operations-aka-dataframe-operations
-/**
- *  Prints the schema (that is column names, types, and the first few values per column) of a dataframe to stdout.
- */
-fun DataFrame.schema(maxDigits: Int = 3, maxWidth: Int = PRINT_MAX_WIDTH) {
-    if (this is GroupedDataFrame) {
-        ungroup().schema(maxDigits, maxWidth)
-        return
+/**
+ * Schema summary of a data-frame: for each column its name, its type label and a preview of its leading values.
+ *
+ * [toString] yields a plain-text rendering, [toHTML] an HTML table with the same information.
+ *
+ * @param df the data-frame to describe
+ * @param maxDigits forwarded to `createValuePrinter` when formatting the previewed values
+ * @param maxWidth maximum length of the joined value preview of a column
+ */
+class DataFrameSchema(
+    private val df: DataFrame,
+    private val maxDigits: Int = 3,
+    private val maxWidth: Int = PRINT_MAX_WIDTH
+) {
+    /** Renders the schema as aligned plain text: a header line with the row count, then one line per column. */
+    override fun toString(): String {
+        val sb = StringBuilder()
+
+        with(df) {
+            val topN = this
+            sb.appendLine("DataFrame with ${nrow} observations")
+
+            val typeLabels = topN.cols.map { col -> getColumnType(col, wrapSquares = true) }
+
+            val namePadding = topN.cols.map { it.name.length }.maxOrNull() ?: 0
+            val typePadding = typeLabels.map { it.length }.maxOrNull() ?: 0
+
+            topN.cols.zip(typeLabels).forEach { (col, typeLabel) ->
+                val stringifiedVals = col.values().take(255).asSequence()
+                    .joinToMaxLengthString(maxLength = maxWidth, transform = createValuePrinter(maxDigits))
+
+
+                sb.appendLine("${col.name.padEnd(namePadding)}  ${typeLabel.padEnd(typePadding)}  $stringifiedVals")
+            }
+        }
+
+        return sb.toString()
     }
 
-    val topN = this
-    println("DataFrame with ${nrow} observations")
+    /**
+     * Renders the schema as an HTML document holding a three-column table (Name, Type, Values) with one row
+     * per column, followed by the observation count. Column names, type labels and value previews are
+     * HTML-escaped before being embedded.
+     */
+    fun toHTML(): String {
+        return StringBuilder().apply {
+
+            append("<html><body>")
+
+            append("<table>")
+
+            // render header
+            append("<tr>")
+            listOf("Name", "Type", "Values").forEach { append("""<th style="text-align:left">${it}</th>""") }
+            append("</tr>")
 
-    val namePadding = topN.cols.map { it.name.length }.maxOrNull() ?: 0
+            with(df) {
+                val topN = this
 
-    val typeLabels = topN.cols.map { col -> getColumnType(col, wrapSquares = true) }
+                val typeLabels = topN.cols.map { col -> getColumnType(col, wrapSquares = true) }
 
-    val typePadding = typeLabels.map { it.length }.maxOrNull() ?: 0
 
-    topN.cols.zip(typeLabels).forEach { (col, typeLabel) ->
-        val stringifiedVals = col.values().take(255).asSequence()
-            .joinToMaxLengthString(maxLength = maxWidth, transform = createValuePrinter(maxDigits))
+                topN.cols.zip(typeLabels).forEach { (col, typeLabel) ->
+                    val stringifiedVals = col.values().take(255).asSequence()
+                        .joinToMaxLengthString(maxLength = maxWidth, transform = createValuePrinter(maxDigits))
 
-        println("${col.name.padEnd(namePadding)}  ${typeLabel.padEnd(typePadding)}  $stringifiedVals")
+                    // one table row per column; the row has to be opened with <tr> (not </tr>) to be well-formed
+
+                    append("<tr>")
+
+                    // escape the cell content: previews such as "<DataFrame [50 x 4]>" would otherwise be parsed as tags
+                    append("""<td style="text-align:left">${col.name.escapeHtml()}</td>""")
+                    append("""<td style="text-align:left">${typeLabel.escapeHtml()}</td>""")
+                    append("""<td style="text-align:left">${stringifiedVals.escapeHtml()}</td>""")
+
+                    append("</tr>")
+                }
+            }
+
+            append("</table>")
+
+            appendLine("DataFrame with ${df.nrow} observations")
+
+            append("</body></html>")
+        }.toString()
+    }
+
+    /** Escapes the characters that are significant in HTML text content; `&` is replaced first to avoid double-escaping. */
+    private fun String.escapeHtml(): String =
+        replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
+}
+
+// see https://spark.apache.org/docs/latest/sql-programming-guide.html#untyped-dataset-operations-aka-dataframe-operations
+/**
+ *  Builds the schema (that is column names, types, and the first few values per column) of a dataframe.
+ *
+ *  Grouped data-frames are ungrouped first. Unless `IS_JUPYTER` is true, the schema is also printed to stdout;
+ *  the [DataFrameSchema] is returned in either case.
+ *
+ *  @param maxDigits forwarded to [DataFrameSchema]
+ *  @param maxWidth forwarded to [DataFrameSchema]
+ *  @return the schema object describing this data-frame
+ */
+fun DataFrame.schema(maxDigits: Int = 3, maxWidth: Int = PRINT_MAX_WIDTH): DataFrameSchema {
+    if (this is GroupedDataFrame) {
+        return ungroup().schema(maxDigits, maxWidth)
+    }
+
+    return DataFrameSchema(this, maxDigits, maxWidth).apply {
+        if (!IS_JUPYTER) {
+            println(toString())
+        }
     }
 }
 
+
 internal fun getColumnType(col: DataCol, wrapSquares: Boolean = false): String {
     return when (col) {
         is DoubleCol -> "Dbl"
diff --git a/src/main/kotlin/krangl/LetsPlot.kt b/src/main/kotlin/krangl/LetsPlot.kt
@@ -1,9 +1,9 @@
+package krangl
+
 import jetbrains.letsPlot.intern.GenericAesMapping
-import krangl.DataFrame
-import krangl.toMap
 
 /** Plot a data-frame with let-plot. To use this mapping add `implementation("org.jetbrains.lets-plot:lets-plot-kotlin-jvm:3.0.1")` or via `%use lets-plot` when using jupyter. */
-fun DataFrame.letsPlot(mapping: GenericAesMapping.() -> Unit = {}) = jetbrains.letsPlot.letsPlot(toMap())
+fun DataFrame.letsPlot(mapping: GenericAesMapping.() -> Unit = {}) = jetbrains.letsPlot.letsPlot(toMap(), mapping)
 
 
 //fun main() {
diff --git a/src/main/kotlin/krangl/integration/Integration.kt b/src/main/kotlin/krangl/integration/Integration.kt
@@ -1,5 +1,7 @@
 package krangl.integration
 
+import krangl.DataFrame
+import krangl.DataFrameSchema
 import krangl.GroupedDataFrame
 import krangl.SimpleDataFrame
 import org.jetbrains.kotlinx.jupyter.api.HTML
@@ -16,16 +18,20 @@ internal class Integration : JupyterIntegration() {
         import("krangl.*")
         render<SimpleDataFrame> { HTML(it.toHTML()) }
         render<GroupedDataFrame> { HTML(it.toHTML()) }
+        render<DataFrameSchema> { HTML(it.toHTML()) }
     }
 
-    fun krangl.DataFrame.toHTML(limit: Int = 20, truncate: Int = 50): String = with(StringBuilder()) {
+    fun DataFrame.toHTML(title: String="A DataFrame", maxRows: Int = 6, truncate: Int = 50): String = with(StringBuilder()) {
         append("<html><body>")
+
+
+
         append("<table><tr>")
 
-        cols.forEach { append("""<th style=\\" text -align:left\\">${it.name}</th>""") }
+        cols.forEach { append("""<th style="text-align:left">${it.name}</th>""") }
         append("</tr>")
 
-        rows.take(limit).forEach {
+        rows.take(maxRows).forEach {
             append("<tr>")
             it.values.map { it.toString() }.forEach {
                 val truncated = if (truncate > 0 && it.length > truncate) {
@@ -41,8 +47,20 @@ internal class Integration : JupyterIntegration() {
 
         append("</table>")
 
-        if (limit < rows.count())
-            append("<p>... only showing top $limit rows</p>")
+        // render footer
+        append("<p>")
+        if (maxRows < rows.count()){
+            append("... with ${nrow - maxRows} more rows. ")
+        }
+
+        appendLine("Shape: ${nrow} x ${ncol}. ")
+
+        if (this@toHTML is GroupedDataFrame) {
+            appendLine("Grouped by ${by.joinToString()} [${groups.size}]")
+        }
+        append("</p>")
+
+
         append("</body></html>")
     }.toString()
 }
diff --git a/src/test/kotlin/krangl/test/CoreVerbsTest.kt b/src/test/kotlin/krangl/test/CoreVerbsTest.kt
@@ -682,7 +682,7 @@ class CoreTests {
                 Petal.Width   [Dbl]    0.2, 0.2, 0.2, 0.2, ...
                 Species       [Str]    setosa, setosa, seto...
                 id            [Regex]  foo1, foo2, foo3, fo...
-                """.trimAndReline()
+                """.trimIndent()
     }
 
     @Test
diff --git a/src/test/kotlin/krangl/test/ReshapingTest.kt b/src/test/kotlin/krangl/test/ReshapingTest.kt
@@ -264,7 +264,7 @@ class NestingTests {
                     DataFrame with 3 observations
                     Species  [Str]        setosa, versicolor, virginica
                     data     [DataFrame]  <DataFrame [50 x 4]>, <DataFrame [50 x 4]>, <DataFrame [50 x 4]>
-                    """.trimAndReline()
+                    """.trimIndent()
             }
     }
 

Original file line number	Diff line number	Diff line change
`@@ -682,7 +682,7 @@ class CoreTests {`
`682`	`682`	`Petal.Width [Dbl] 0.2, 0.2, 0.2, 0.2, ...`
`683`	`683`	`Species [Str] setosa, setosa, seto...`
`684`	`684`	`id [Regex] foo1, foo2, foo3, fo...`
`685`		`- """.trimAndReline()`
	`685`	`+ """.trimIndent()`
`686`	`686`	`}`
`687`	`687`
`688`	`688`	`@Test`
Original file line number	Diff line number	Diff line change
`@@ -264,7 +264,7 @@ class NestingTests {`
`264`	`264`	`DataFrame with 3 observations`
`265`	`265`	`Species [Str] setosa, versicolor, virginica`
`266`	`266`	`data [DataFrame] <DataFrame [50 x 4]>, <DataFrame [50 x 4]>, <DataFrame [50 x 4]>`
`267`		`- """.trimAndReline()`
	`267`	`+ """.trimIndent()`
`268`	`268`	`}`
`269`	`269`	`}`
`270`	`270`