From 3bf8e4ac24acdd3e82e229e46dea6b946b4f61ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 10:48:38 +0200 Subject: [PATCH 1/7] ci --- .github/workflows/publish-to-test-pypi.yml | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 .github/workflows/publish-to-test-pypi.yml diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml new file mode 100644 index 0000000..eac8f73 --- /dev/null +++ b/.github/workflows/publish-to-test-pypi.yml @@ -0,0 +1,74 @@ +name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI + +on: push + +jobs: + build: + name: Build distribution 📦 + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install pypa/build + run: >- + python3 -m + pip install + build + --user + - name: Build a binary wheel and a source tarball + run: python3 -m build + - name: Store the distribution packages + uses: actions/upload-artifact@v4 + with: + name: python-package-distributions + path: dist/ + publish-to-pypi: + name: >- + Publish Python 🐍 distribution 📦 to PyPI + if: startsWith(github.ref, 'refs/tags/') # only publish to PyPI on tag pushes + needs: + - build + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/refinedoc + permissions: + id-token: write # IMPORTANT: mandatory for trusted publishing + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + + publish-to-testpypi: + name: Publish Python 🐍 distribution 📦 to TestPyPI + needs: + - build + runs-on: ubuntu-latest + + environment: + name: testpypi + url: https://test.pypi.org/p/refinedoc + + permissions: + id-token: write # IMPORTANT: mandatory for 
trusted publishing + + steps: + - name: Download all the dists + uses: actions/download-artifact@v4 + with: + name: python-package-distributions + path: dist/ + - name: Publish distribution 📦 to TestPyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + repository-url: https://test.pypi.org/legacy/ \ No newline at end of file From 8fbf0af45c9ed36f49501f1346f96643e5e1aada Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 11:05:19 +0200 Subject: [PATCH 2/7] ci --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 559f64b..a38d9d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "refinedoc" -version = "0.0.3" +version = "0.0.4" authors = [ { name="Théo NARDIN", email="theo.nardin@learningplanetinstitute.org" }, ] From 0663cba23e3410726bd980467a27ecdf756cf24e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 11:08:57 +0200 Subject: [PATCH 3/7] add information --- README.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4f358cf..a7d7c66 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,8 @@ You can install with pip ``` pip install refinedoc ``` -### Example +### Example (vanilla) + ```python from refinedoc.refined_document import RefinedDocument @@ -61,6 +62,27 @@ body = rd.body # [["lorem ipsum dolor sit amet", "consectetur adipiscing elit"], ["sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"], ["ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat"], ["duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur"]] ``` +## Example (with pypdf) + +```python +from refinedoc.refined_document import RefinedDocument +from pypdf import PdfReader + +# Build the document from a PDF file +reader = 
PdfReader("path/to/your/pdf/file.pdf") +document = [] +for page in reader.pages: + document.append(page.extract_text().split("\n")) + +rd = RefinedDocument(content=document) +headers = rd.headers +# [["header 1", "subheader 1"], ["header 2", "subheader 2"], ["header 3", "subheader 3"], ["header 4", "subheader 4"]] +footers = rd.footers +# [["footer 1"], ["footer 2"], ["footer 3"], ["footer 4"]] +body = rd.body +# [["lorem ipsum dolor sit amet", "consectetur adipiscing elit"], ["sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"], ["ut enim ad minim veniam quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat"], ["duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur"]] +``` + ## How it's work My work is based on this paper : [Lin, Xiaofan. (2003). Header and Footer Extraction by Page-Association. 5010. 164-171. 10.1117/12.472833. ](https://www.researchgate.net/publication/221253782_Header_and_Footer_Extraction_by_Page-Association) From b75ab5800efd95a30a0dca26c87cce785c979e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 11:17:34 +0200 Subject: [PATCH 4/7] add tests --- .github/workflows/publish-to-test-pypi.yml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml index eac8f73..b00c424 100644 --- a/.github/workflows/publish-to-test-pypi.yml +++ b/.github/workflows/publish-to-test-pypi.yml @@ -1,6 +1,7 @@ name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI -on: push +on: + push jobs: build: @@ -28,6 +29,25 @@ jobs: with: name: python-package-distributions path: dist/ + + test: + name: Run tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Run tests + env: + 
PYTHONPATH: ${{ github.workspace }} # Ensure the current workspace is in the PYTHONPATH + run: python3 -m unittest discover tests/ + + publish-to-pypi: name: >- Publish Python 🐍 distribution 📦 to PyPI From 8733cd78b02dd7a7218c9f40cb40c844c99f2fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 11:19:03 +0200 Subject: [PATCH 5/7] add tests --- .github/workflows/publish-to-test-pypi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml index b00c424..2fb3269 100644 --- a/.github/workflows/publish-to-test-pypi.yml +++ b/.github/workflows/publish-to-test-pypi.yml @@ -44,7 +44,7 @@ jobs: python-version: "3.x" - name: Run tests env: - PYTHONPATH: ${{ github.workspace }} # Ensure the current workspace is in the PYTHONPATH + PYTHONPATH: src run: python3 -m unittest discover tests/ From e805fa8dc89784da654670f9826aea9e75bf98bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 11:20:11 +0200 Subject: [PATCH 6/7] add tests into pipeline --- .github/workflows/publish-to-test-pypi.yml | 37 +++++++++++----------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/.github/workflows/publish-to-test-pypi.yml b/.github/workflows/publish-to-test-pypi.yml index 2fb3269..fb134ac 100644 --- a/.github/workflows/publish-to-test-pypi.yml +++ b/.github/workflows/publish-to-test-pypi.yml @@ -4,10 +4,28 @@ on: push jobs: + test: + name: Run tests + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Run tests + env: + PYTHONPATH: src + run: python3 -m unittest discover tests/ + build: name: Build distribution 📦 runs-on: ubuntu-latest - + needs: + - test steps: - uses: actions/checkout@v4 with: @@ -30,23 +48,6 @@ jobs: name: python-package-distributions path: dist/ - test: - 
name: Run tests - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v4 - with: - persist-credentials: false - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - name: Run tests - env: - PYTHONPATH: src - run: python3 -m unittest discover tests/ - publish-to-pypi: name: >- From 07103bd3e58798ed44449e59b48d2f0802b54518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o?= Date: Thu, 10 Jul 2025 11:21:58 +0200 Subject: [PATCH 7/7] publish 1st main version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a38d9d3..210fba6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "refinedoc" -version = "0.0.4" +version = "1.0.0" authors = [ { name="Théo NARDIN", email="theo.nardin@learningplanetinstitute.org" }, ]