diff --git a/Cargo.lock b/Cargo.lock index d84574f0..946f46a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6346,9 +6346,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.138" +version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ "indexmap", "itoa", diff --git a/docs/notebooks/base_sequence_quality.ipynb b/docs/notebooks/base_sequence_quality.ipynb new file mode 100644 index 00000000..ac49dd6f --- /dev/null +++ b/docs/notebooks/base_sequence_quality.ipynb @@ -0,0 +1,455 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "83d7ccd7", + "metadata": {}, + "source": [ + "### Import dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "58b40aa6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/user/.pyenv/versions/3.12.9/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "INFO:polars_bio:Creating BioSessionContext\n" + ] + } + ], + "source": [ + "import polars_bio as pb\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "2e29cd0a", + "metadata": {}, + "source": [ + "### Usage examples" + ] + }, + { + "cell_type": "markdown", + "id": "b238193d", + "metadata": {}, + "source": [ + "#### Usage example - .fastq file" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0420c240", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " pos avg q1 median q3 lower upper\n", + "0 47 37.665 37.535714 39.921053 41.060185 32.249008 46.346892\n", + "1 38 37.640 37.964286 40.067308 41.024038 33.374657 45.613668\n", + "2 65 35.995 35.190000 37.433333 39.810000 28.260000 46.740000\n", + "3 3 35.690 35.483333 37.208661 37.600394 32.307743 40.775984\n", + "4 41 37.870 37.678571 40.116071 41.004902 32.689076 45.994398\n", + ".. ... ... ... ... ... ... ...\n", + "96 42 37.780 37.583333 40.139344 40.954918 32.525956 46.012295\n", + "97 43 37.775 38.114583 40.126866 40.869403 33.982354 45.001632\n", + "98 15 38.725 38.226190 40.352459 41.168033 33.813427 45.580796\n", + "99 46 37.790 37.479167 39.975000 41.042453 32.134237 46.387382\n", + "100 24 38.265 38.397059 40.095745 41.125000 34.305147 45.216912\n", + "\n", + "[101 rows x 7 columns]\n" + ] + } + ], + "source": [ + "result = pb.base_sequence_quality(\"example.fastq\", output_type=\"pandas.DataFrame\", target_partitions=2)\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "9886c394", + "metadata": {}, + "source": [ + "#### Usage example - .csv file" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "66c3af24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| pos | avg | q1 | median | q3 | lower | upper |
|---|---|---|---|---|---|---|
| u64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 47 | 37.665 | 37.535714 | 39.921053 | 41.060185 | 32.249008 | 46.346892 |
| 90 | 32.275 | 33.183333 | 34.986842 | 35.63961 | 29.498918 | 39.324026 |
| 26 | 37.855 | 38.028846 | 40.078947 | 40.951754 | 33.644484 | 45.336117 |
| 7 | 35.4 | 35.465909 | 36.90625 | 37.487374 | 32.433712 | 40.519571 |
| 3 | 35.69 | 35.483333 | 37.208661 | 37.600394 | 32.307743 | 40.775984 |
| … | … | … | … | … | … | … |
| 19 | 38.425 | 38.2375 | 40.238462 | 41.004902 | 34.086397 | 45.156005 |
| 94 | 30.775 | 32.057692 | 34.651515 | 35.51 | 26.879231 | 40.688462 |
| 17 | 38.505 | 38.125 | 40.127451 | 41.09375 | 33.671875 | 45.546875 |
| 79 | 32.46 | 31.96875 | 35.205357 | 36.4375 | 25.265625 | 43.140625 |
| 41 | 37.87 | 37.678571 | 40.116071 | 41.004902 | 32.689076 | 45.994398 |
| pos | avg | q1 | median | q3 | lower | upper |
|---|---|---|---|---|---|---|
| u64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 44 | 37.565 | 37.71875 | 40.133929 | 41.024038 | 32.760817 | 45.981971 |
| 27 | 38.44 | 38.234375 | 40.27 | 41.207031 | 33.775391 | 45.666016 |
| 72 | 33.265 | 33.861111 | 35.6875 | 37.923913 | 27.766908 | 44.018116 |
| 86 | 31.815 | 33.365385 | 35.132911 | 35.762658 | 29.769474 | 39.358569 |
| 90 | 32.275 | 33.183333 | 34.986842 | 35.63961 | 29.498918 | 39.324026 |
| … | … | … | … | … | … | … |
| 34 | 38.205 | 37.979167 | 40.159574 | 41.168033 | 33.195867 | 45.951332 |
| 84 | 32.415 | 33.25 | 35.182432 | 35.85473 | 29.342905 | 39.761824 |
| 49 | 37.21 | 36.678571 | 39.645833 | 40.763889 | 30.550595 | 46.891865 |
| 63 | 36.25 | 35.215909 | 38.026316 | 40.275 | 27.627273 | 47.863636 |
| 80 | 32.61 | 32.625 | 35.289474 | 36.420455 | 26.931818 | 42.113636 |
| pos | avg | q1 | median | q3 | lower | upper |
|---|---|---|---|---|---|---|
| u64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 64 | 36.095 | 35.064815 | 37.7 | 40.145833 | 27.443287 | 47.767361 |
| 4 | 35.68 | 35.552885 | 37.208661 | 37.600394 | 32.481621 | 40.671657 |
| 68 | 35.91 | 34.940476 | 36.931818 | 39.539474 | 28.04198 | 46.43797 |
| 72 | 33.265 | 33.861111 | 35.6875 | 37.923913 | 27.766908 | 44.018116 |
| 58 | 36.77 | 35.338235 | 38.90625 | 40.717105 | 27.26993 | 48.78541 |
| … | … | … | … | … | … | … |
| 26 | 37.855 | 38.028846 | 40.078947 | 40.951754 | 33.644484 | 45.336117 |
| 9 | 37.36 | 37.34375 | 39.014706 | 39.502451 | 34.105699 | 42.740502 |
| 71 | 33.005 | 33.791667 | 35.851351 | 38.583333 | 26.604167 | 45.770833 |
| 11 | 37.71 | 37.784483 | 39.094595 | 39.542793 | 35.147018 | 42.180258 |
| 24 | 38.265 | 38.397059 | 40.095745 | 41.125 | 34.305147 | 45.216912 |
| pos | avg | q1 | median | q3 | lower | upper |
|---|---|---|---|---|---|---|
| u64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 3 | 35.69 | 35.483333 | 37.208661 | 37.600394 | 32.307743 | 40.775984 |
| 29 | 38.595 | 38.35 | 40.258333 | 41.09375 | 34.234375 | 45.209375 |
| 44 | 37.565 | 37.71875 | 40.133929 | 41.024038 | 32.760817 | 45.981971 |
| 76 | 30.265 | 30.916667 | 35.414894 | 37.073529 | 21.681373 | 46.308824 |
| 67 | 35.96 | 34.723684 | 36.972222 | 39.81 | 27.094211 | 47.439474 |
| … | … | … | … | … | … | … |
| 15 | 38.725 | 38.22619 | 40.352459 | 41.168033 | 33.813427 | 45.580796 |
| 26 | 37.855 | 38.028846 | 40.078947 | 40.951754 | 33.644484 | 45.336117 |
| 39 | 37.895 | 38.027778 | 40.125 | 40.856618 | 33.784518 | 45.099877 |
| 97 | 30.67 | 31.575 | 34.890625 | 35.557229 | 25.601657 | 41.530572 |
| 51 | 37.53 | 36.71875 | 39.447368 | 41.024038 | 30.260817 | 47.481971 |
| pos | avg | q1 | median | q3 | lower | upper |
|---|---|---|---|---|---|---|
| u64 | f64 | f64 | f64 | f64 | f64 | f64 |
| 71 | 33.005 | 33.791667 | 35.851351 | 38.583333 | 26.604167 | 45.770833 |
| 74 | 30.83 | 32.75 | 35.532609 | 37.485294 | 25.647059 | 44.588235 |
| 45 | 37.45 | 36.59375 | 40.071429 | 41.077273 | 29.868466 | 47.802557 |
| 49 | 37.21 | 36.678571 | 39.645833 | 40.763889 | 30.550595 | 46.891865 |
| 75 | 31.06 | 30.958333 | 35.418367 | 37.283333 | 21.470833 | 46.770833 |
| … | … | … | … | … | … | … |
| 50 | 37.425 | 35.975 | 39.770833 | 41.060185 | 28.347222 | 48.687963 |
| 3 | 35.69 | 35.483333 | 37.208661 | 37.600394 | 32.307743 | 40.775984 |
| 0 | 30.135 | 31.213636 | 33.722222 | 34.482143 | 26.310877 | 39.384903 |
| 37 | 38.0 | 38.215278 | 40.080357 | 40.96875 | 34.085069 | 45.098958 |
| 65 | 35.995 | 35.19 | 37.433333 | 39.81 | 28.26 | 46.74 |
| + average GC content + | ++ 46.15999984741211 + | +
|---|---|
| + average read length + | ++ 101 + | +
| + canonical + | ++ True + | +
| + file name + | ++ example.fastq + | +
| + k + | ++ 5 + | +
| + total reads + | ++ 200 + | +