Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
7316257
Checks abbreviations press and reports
LapshinAE0 Nov 21, 2025
c89d506
Fixed if present check
LapshinAE0 Nov 21, 2025
8394b5a
abbr taken out
LapshinAE0 Dec 1, 2025
975b4d2
correct check abbr
LapshinAE0 Dec 1, 2025
280d9cd
fixed checks 1
LapshinAE0 Dec 4, 2025
7fe11be
fixed checks 2
LapshinAE0 Dec 4, 2025
edd07ce
fixed checks 3
LapshinAE0 Dec 4, 2025
d696932
Merge branch 'master' into 555_check_abbreviations
LapshinAE0 Dec 4, 2025
ab8f2f7
Update banned_words_in_literature.py
HadronCollider Dec 5, 2025
331a121
fix svg size and background color
HadronCollider Nov 20, 2025
ac65e06
fix dev docker compose
HadronCollider Nov 20, 2025
c69548c
print traceback to logs (and check result)
HadronCollider Nov 20, 2025
bbbf47d
little change for svg size
HadronCollider Nov 20, 2025
31371af
add more feedback for UNEXPECTED_CHECK_FAIL_MSG
HadronCollider Nov 20, 2025
df61abd
little kostil'
HadronCollider Nov 20, 2025
c90d82d
update template results.html
HadronCollider Nov 28, 2025
3269e53
569: fix big files in webpack
necit-dev Jul 7, 2024
bdfefdf
update some html-templates and styles
HadronCollider Nov 30, 2025
eece7ea
update Dockerfiles (base and main), requirements and some python-libs
HadronCollider Nov 30, 2025
8280103
update main_character_check
HadronCollider Dec 8, 2025
c88f47e
add recheck test
HadronCollider Dec 8, 2025
495b68d
Update style_check_settings.py
HadronCollider Dec 10, 2025
dcad677
add warned_words for banned_words_check
HadronCollider Dec 17, 2025
8b5865e
add login_required and author check for result page
HadronCollider Dec 26, 2025
02b70ba
little update for 404 page
HadronCollider Dec 26, 2025
9a54360
all requier applyied-2
LapshinAE0 Jan 26, 2026
96cb746
all requier applyied-3
LapshinAE0 Jan 26, 2026
d2d09a6
Merge branch 'dev' into 555_check_abbreviations
LapshinAE0 Jan 29, 2026
d26fcd3
add case for title page
LapshinAE0 Feb 7, 2026
fd19063
dont check abbr title page
LapshinAE0 Feb 10, 2026
467bef5
Merge branch 'dev' into 555_check_abbreviations
HadronCollider Feb 15, 2026
f1a8f16
Merge branch 'dev' into 555_check_abbreviations
HadronCollider Feb 15, 2026
a224c89
fix page num
LapshinAE0 Mar 13, 2026
30689a7
corrected check and added log for debug
LapshinAE0 Mar 27, 2026
a777fb4
Merge branch 'dev' into 555_check_abbreviations
LapshinAE0 Mar 27, 2026
5fcbfb4
remove description
LapshinAE0 Apr 10, 2026
8ee7e11
Merge branch 'dev' into 555_check_abbreviations
LapshinAE0 Apr 14, 2026
cdcd6a0
Merge branch 'dev' into 555_check_abbreviations
HadronCollider Apr 19, 2026
23737ff
format code
HadronCollider Apr 19, 2026
bcc5f5b
ReportAbbreviationsCheck and PresAbbreviationsCheck to warning check
HadronCollider Apr 19, 2026
fb8be40
feat: added config-json abbr
LapshinAE0 Apr 24, 2026
038c6d7
fix: fixed load abbr
LapshinAE0 May 3, 2026
b4ebea2
Merge branch 'dev' of https://github.com/moevm/document_insight_syste…
LapshinAE0 May 22, 2026
a1a7d44
fix: linter
LapshinAE0 May 22, 2026
bd30425
fix: linter 2
LapshinAE0 May 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions app/configs/config_abbreviations.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"common_abbr": [
"СССР", "РФ", "США", "ВКР", "ИТ", "ПО", "ООО", "ЗАО", "ОАО", "HTML", "CSS",
"JS", "ЛЭТИ", "МОЕВМ", "ЭВМ", "ГОСТ", "DVD", "ИИ", "ОБЗОР",
"ООП", "ЛР", "КР", "ОТЧЕТ", "ПЛАН", "СЛОВА", "ЦПУ", "МБ", "ОЗУ", "КБ",
"SSD", "PC", "HDD",
"AX", "BX", "CX", "DX", "SI", "DI", "BP", "SP",
"AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
"CS", "DS", "ES", "SS", "FS", "GS",
"IP", "EIP", "RIP", "URL",
"CF", "PF", "AF", "ZF", "SF", "TF", "IF", "DF", "OF",
"EAX", "EBX", "ECX", "EDX", "ESI", "EDI", "EBP", "ESP",
"RAX", "RBX", "RCX", "RDX", "RSI", "RDI", "RBP", "RSP",
"DOS", "OS", "BIOS", "UEFI", "MBR", "GPT",
"ASCII", "UTF", "UNICODE", "ANSI",
"МОЭВМ",
"CPU", "GPU", "APU", "RAM", "ROM", "PROM", "EPROM", "EEPROM",
"USB", "SATA", "PCI", "PCIe", "AGP", "ISA", "VGA", "HDMI", "DP",
"LAN", "WAN", "WLAN", "VPN", "ISP", "DNS", "DHCP", "TCP", "UDP",
"HTTP", "HTTPS", "FTP", "SSH", "SSL", "TLS", "XP", "ELF", "ACM", "IEEE", "UX",
"API", "GUI", "CLI", "IDE", "SDK", "SQL", "NoSQL", "XML", "JSON", "YAML",
"MAC", "IBM", "CERF", "LTR", "RTL", "FPS", "SHA", "AR", "EN", "RU",
"CREAT", "FIFO", "RSS", "UML", "UI", "GB", "IJGBL"
]
}
2 changes: 2 additions & 0 deletions app/main/check_packs/pack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
['pres_image_capture'],
['task_tracker'],
['overview_in_tasks'],
['pres_abbreviations_check'],
['pres_aspect_ratio_check'],
['pres_was_were_check'],
]
Expand Down Expand Up @@ -53,6 +54,7 @@
["empty_task_page_check"],
["water_in_the_text_check"],
["report_task_tracker"],
["report_abbreviations_check"],
["report_was_were_check"],
]

Expand Down
137 changes: 137 additions & 0 deletions app/main/checks/check_abbreviations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import json
import re
from pathlib import Path

from pymorphy3 import MorphAnalyzer

# Module-wide morphological analyzer, created once at import time and used by find_abbreviations().
morph = MorphAnalyzer()

# Switch read by debug_print(): its arguments are printed only while this is True.
DEBUG_MODE = False


def load_abbreviations():
    """Load the abbreviations that are exempt from the "must be explained" rule.

    The list is read from ``configs/config_abbreviations.json`` (key
    ``"common_abbr"``); the ``configs`` directory is resolved relative to this
    module, two directories above the module's own directory.

    Returns:
        set: the configured abbreviations; empty when the key is missing or null.

    Raises:
        OSError: if the config file cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    config_path = Path(__file__).parent.parent.parent / "configs" / "config_abbreviations.json"
    with open(config_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # ``or []`` covers both a missing key and an explicit null; ``set(None)``
    # would otherwise raise TypeError while the module is being imported.
    return set(data.get("common_abbr") or [])


# Abbreviations that find_abbreviations() never reports; read from the JSON config once at import time.
COMMON_ABBR = load_abbreviations()


def debug_print(*args, **kwargs):
    """Forward all arguments to ``print`` when ``DEBUG_MODE`` is on; otherwise do nothing."""
    if not DEBUG_MODE:
        return
    print(*args, **kwargs)


def get_first_letters(phrase):
    """Return the upper-cased initials of the whitespace-separated words in *phrase*.

    A falsy *phrase* (``None`` or an empty string) yields an empty string.
    """
    # ``str.split()`` without arguments never produces empty tokens, so every
    # token has a first character to take.
    initials = [token[0].upper() for token in (phrase or "").split()]
    return "".join(initials)


def is_abbreviation_explained(abbr: str, text: str) -> bool:
    """Tell whether *text* contains an explanation whose word initials spell *abbr*.

    Recognised layouts (matched case-insensitively):
    ``ABBR (explanation)``, ``(explanation) ABBR``, ``ABBR — explanation`` and
    ``explanation — ABBR``, where the dash may be an em dash or a hyphen.

    Every occurrence of every layout is examined, so an unrelated earlier
    match (for example ``ABBR (see below)``) does not hide a valid explanation
    that appears later in the text.

    Args:
        abbr: the abbreviation to look for; it is matched literally.
        text: the text to search.

    Returns:
        True if at least one candidate explanation passes ``correctly_explained``.
    """
    # Escape so that the abbreviation can never be interpreted as regex syntax.
    token = re.escape(abbr)
    # The hyphen-only variants are kept on purpose: their match positions can
    # differ from the combined "[—\-]" variants, so dropping them could lose
    # explanations that were accepted before.
    patterns = [
        rf"{token}\s*\(([^)]+)\)",  # ABBR (explanation)
        rf"\(([^)]+)\)\s*{token}",  # (explanation) ABBR
        rf"{token}\s*[—\-]\s*([^.,;!?]+)",  # ABBR — explanation
        rf"{token}\s*-\s*([^.,;!?]+)",  # ABBR - explanation
        rf"([^.,;!?]+)\s*[—\-]\s*{token}",  # explanation — ABBR
        rf"([^.,;!?]+)\s*-\s*{token}",  # explanation - ABBR
    ]

    debug_print(f"Проверка аббревиатуры: {abbr}")
    debug_print(f"Текст (первые 200 символов): {text[:200]}...")

    for pattern in patterns:
        # finditer instead of search: inspect all occurrences, not only the first.
        for match in re.finditer(pattern, text, re.IGNORECASE):
            explanation = match.group(1)
            debug_print(f" Найден паттерн {pattern}")
            debug_print(f" Расшифровка: {explanation}")

            if correctly_explained(abbr, explanation):
                debug_print(" Расшифровка корректна")
                return True

            debug_print(" Расшифровка НЕ соответствует первым буквам")
            debug_print(f" Ожидалось: {abbr.upper()}")
            debug_print(f" Получено: {get_first_letters(explanation)}")

    debug_print(f" Расшифровка для {abbr} не найдена")
    return False


def get_unexplained_abbrev(text, unverifiable_text):
    """Collect the abbreviations from *text* that have no valid explanation.

    Returns:
        A pair ``(found, unexplained)``: ``found`` is False when
        ``find_abbreviations`` reports nothing to check (``unexplained`` is
        then an empty list); otherwise ``found`` is True and ``unexplained``
        lists the abbreviations rejected by ``is_abbreviation_explained``.
    """
    candidates = find_abbreviations(text, unverifiable_text)
    if candidates:
        missing = [item for item in candidates if not is_abbreviation_explained(item, text)]
        return True, missing
    return False, []


def find_abbreviations(text: str, unverifiable_text: str):
    """Return the abbreviations in *text* that have to be checked for an explanation.

    A candidate is a standalone run of 2-5 upper-case Cyrillic or Latin
    letters. A candidate is dropped when it is listed in ``COMMON_ABBR``, when
    it occurs as a whole word in *unverifiable_text*, or when the top
    ``morph.parse`` result for its lower-cased form has a zero score.

    Args:
        text: the text to scan.
        unverifiable_text: text whose abbreviations are exempt; ``None`` or an
            empty string means nothing is exempt.

    Returns:
        list: unique candidates in order of first appearance in *text*.
    """
    pattern = r"\b[А-ЯA-Z]{2,5}\b"

    # Compare whole tokens, not substrings: a plain ``abbr in unverifiable_text``
    # would exempt e.g. "ИС" merely because it occurs inside "СПИСОК".
    exempt = set(re.findall(pattern, unverifiable_text or ""))

    # dict.fromkeys removes duplicates while keeping first-occurrence order, so
    # the result is deterministic and each candidate is analysed only once.
    candidates = dict.fromkeys(re.findall(pattern, text))

    return [
        abbr
        for abbr in candidates
        if abbr not in COMMON_ABBR
        and abbr not in exempt
        # NOTE(review): the exact meaning of a zero score depends on pymorphy — confirm the intent.
        and morph.parse(abbr.lower())[0].score != 0
    ]


def correctly_explained(abbr, explan):
    """Return True when the initials of the words in *explan* spell *abbr* (case-insensitively)."""
    initials = "".join(token[0].upper() for token in explan.split())
    return abbr.upper() == initials


def main_check(text: str, unverifiable_text: str):
    """Run the abbreviation check over *text* and summarise the outcome.

    Args:
        text: the full document text to scan.
        unverifiable_text: text whose abbreviations are exempt from the check.

    Returns:
        A triple ``(continue_check, message, unexplained_abbr)``:

        * ``(True, "", [abbr, ...])`` when unexplained abbreviations were found
          and the caller should go on to locate them;
        * ``(False, message, [])`` otherwise, where ``message`` says why there
          is nothing to report (no text, no abbreviations, everything
          explained, or an internal error).
    """
    # Each terminal state returns immediately. Previously the later conditions
    # overwrote the earlier messages, so "no text" and "no abbreviations" were
    # always reported as "everything is explained" (or as an internal error).
    if not text:
        return False, "Не удалось получить текст", []

    try:
        debug_print(f"unverifiable_text : {unverifiable_text}")

        abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(text=text, unverifiable_text=unverifiable_text)

        if not abbr_is_finding:
            return False, "Аббревиатуры не найдены в представленном документе", []

        if not unexplained_abbr:
            return False, "Все аббревиатуры правильно расшифрованы", []

        return True, "", unexplained_abbr

    except Exception as e:
        # Best-effort check: report the failure to the caller instead of raising.
        # The third item is a list here too, matching every other return path.
        return False, f"Ошибка при проверке аббревиатур: {str(e)}", []


def forming_response(unexplained_abbr_with_page, format_page_link):
    """Build the HTML feedback that lists unexplained abbreviations and their pages.

    Args:
        unexplained_abbr_with_page: mapping ``abbreviation -> page (or slide) number``.
        format_page_link: callable that receives the list of page numbers (in
            mapping order) and returns an indexable sequence with one
            displayable link per number.

    Returns:
        str: ``<br>``-separated message: a header, one line per abbreviation and
        a two-line hint on how abbreviations must be explained.
    """
    page_links = format_page_link(list(unexplained_abbr_with_page.values()))

    parts = ["Найдены нерасшифрованные аббревиатуры при первом использовании:<br>"]
    parts.extend(
        f"- {abbr} на {page_links[index]} странице<br>" for index, abbr in enumerate(unexplained_abbr_with_page)
    )
    parts.append("Каждая аббревиатура должна быть расшифрована при первом использовании в тексте.<br>")
    # Grammar of the user-facing hint fixed (was: "должны быть по первыми буквам").
    parts.append("Расшифровка должна быть по первым буквам, например, МВД - Министерство внутренних дел.<br>")
    return "".join(parts)
37 changes: 37 additions & 0 deletions app/main/checks/presentation_checks/abbreviations_presentation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from ..base_check import BasePresCriterion, answer
from ..check_abbreviations import forming_response, main_check


class PresAbbreviationsCheck(BasePresCriterion):
    """Presentation criterion: abbreviations must be explained where they are first used.

    The criterion is declared with ``warning = True``. The text of the first
    slide is passed to ``main_check`` as the exempt ("unverifiable") text.
    """

    _description = "Аббревиатуры в тексте должны быть расшифрованы при первом использовании."
    id = "pres_abbreviations_check"
    warning = True

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the abbreviation check over all slides.

        Returns:
            ``answer(True, message)`` when ``main_check`` reports nothing to
            complain about, ``answer(False, details)`` listing each unexplained
            abbreviation with the first slide it occurs on, or
            ``answer(False, error)`` if an unexpected exception is raised.
        """
        try:
            slides_text = self.file.get_text_from_slides()
            # An empty deck has no title slide; use an empty exemption text
            # instead of failing with IndexError on slides_text[0].
            title_page = slides_text[0] if slides_text else ""
            full_text = " ".join(slides_text)

            continue_check, res_str, unexplained_abbr = main_check(text=full_text, unverifiable_text=title_page)
            if not continue_check:
                return answer(True, res_str)

            unexplained_abbr_with_slides = {}

            # Remember only the first slide on which each abbreviation occurs.
            # NOTE(review): slide numbers are counted from 0 here — confirm this
            # is the numbering format_page_link expects.
            for slide_num, slide_text in enumerate(slides_text, 0):
                for abbr in unexplained_abbr:
                    if abbr in slide_text and abbr not in unexplained_abbr_with_slides:
                        unexplained_abbr_with_slides[abbr] = slide_num

            if not unexplained_abbr_with_slides:
                return answer(True, "Все аббревиатуры правильно расшифрованы")

            # The bound method is passed directly; no wrapping lambda is needed.
            result_str = forming_response(unexplained_abbr_with_slides, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")
89 changes: 89 additions & 0 deletions app/main/checks/report_checks/abbreviations_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from ..base_check import BaseReportCriterion, answer
from ..check_abbreviations import forming_response, main_check


class ReportAbbreviationsCheck(BaseReportCriterion):
    """Report criterion: abbreviations must be explained where they are first used.

    The criterion is declared with ``warning = True``. Abbreviations that occur
    on the title page or in the sections listed in ``check`` are exempt.
    """

    label = "Проверка расшифровки аббревиатур"
    _description = "Аббревиатуры в тексте должны быть расшифрованы при первом использовании."
    id = "report_abbreviations_check"
    warning = True

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the abbreviation check over the whole report.

        Returns:
            ``answer(True, message)`` when ``main_check`` reports nothing to
            complain about, ``answer(False, details)`` listing each unexplained
            abbreviation with the first page it occurs on, or
            ``answer(False, error)`` if an unexpected exception is raised.
        """
        try:
            text = self._get_document_text()

            # Sections whose abbreviations are not checked.
            headings = [
                "СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ",
                "ПРИЛОЖЕНИЕ",
                "ОПРЕДЕЛЕНИЯ, ОБОЗНАЧЕНИЯ И СОКРАЩЕНИЯ",
            ]
            unverifiable_text = self._get_unverifiable_text(headings)

            continue_check, res_str, unexplained_abbr = main_check(text=text, unverifiable_text=unverifiable_text)
            if not continue_check:
                return answer(True, res_str)

            unexplained_abbr_with_page = {}

            # Pages are numbered from 1; remember only the first page on which
            # each abbreviation occurs.
            for page_num in range(1, self.file.page_counter() + 1):
                text_on_page = self.file.pdf_file.text_on_page[page_num]

                for abbr in unexplained_abbr:
                    if abbr in text_on_page and abbr not in unexplained_abbr_with_page:
                        unexplained_abbr_with_page[abbr] = page_num

            if not unexplained_abbr_with_page:
                return answer(True, "Все аббревиатуры правильно расшифрованы")
            # The bound method is passed directly; no wrapping lambda is needed.
            result_str = forming_response(unexplained_abbr_with_page, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")

    def _get_document_text(self):
        """Return the full document text, or None if no text source is available.

        Prefers the per-page PDF text (joined with spaces); otherwise falls
        back to the document paragraphs (joined with newlines).
        """
        if hasattr(self.file, "pdf_file"):
            # NOTE(review): check() reads the ``text_on_page`` attribute while this
            # calls ``get_text_on_page()`` — confirm both yield the same mapping.
            page_texts = self.file.pdf_file.get_text_on_page()
            return " ".join(page_texts.values())
        elif hasattr(self.file, "paragraphs"):
            text_parts = []
            for paragraph in self.file.paragraphs:
                text = paragraph.to_string()
                # Only the second line is kept; presumably to_string() puts a
                # header line before the paragraph text — TODO confirm.
                if "\n" in text:
                    text = text.split("\n")[1]
                text_parts.append(text)
            return "\n".join(text_parts)
        return None

    def _get_text_into_sections(self, headings):
        """Return the space-joined text of every chapter whose title contains one of *headings*.

        The comparison is case-insensitive. For a matching chapter its own
        text and the text of all nested children are collected.
        """
        chapters = self.file.make_chapters(self.file_type["report_type"])
        text_parts = []

        # Defined once rather than on every loop iteration.
        def add_child_text(child_elements):
            """Append the text of *child_elements* and of their descendants, depth-first."""
            for child in child_elements:
                if child.get("text"):
                    text_parts.append(child["text"])
                if child.get("child"):
                    add_child_text(child["child"])

        for chapter in chapters:
            chapter_title = chapter.get("text", "").upper()

            if any(stop.upper() in chapter_title for stop in headings):
                text_parts.append(chapter["text"])

                if chapter.get("child"):
                    add_child_text(chapter["child"])

        return " ".join(text_parts)

    def _get_text_title_page(self):
        """Return the text of the first PDF page, which is treated as the title page."""
        title_page = self.file.pdf_file.text_on_page[1]
        return title_page

    def _get_unverifiable_text(self, unverifiable_headings):
        """Return the title-page text followed by the text of the exempt sections."""
        # Join with a space so that the last word of the title page and the
        # first word of the sections are not fused into one token.
        return " ".join((self._get_text_title_page(), self._get_text_into_sections(unverifiable_headings)))
2 changes: 1 addition & 1 deletion app/main/checks/report_checks/literature_references.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def search_references(self, start_par):
match = re.search(r'Таблица ([.\d]+)', paragraph_text)
table_text = ''
if match:
index_table += 1 # int(match.group(1)) - 1 # TODO: fix logic
index_table += 1 # int(match.group(1)) - 1 # TODO: fix logic
table_text = self.get_text_in_table(index_table)

paragraph_text += table_text
Expand Down
4 changes: 3 additions & 1 deletion app/main/checks/report_checks/main_page_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,9 @@ class ReportMainPageSetting:
"found_value": 0,
"found_key": 0,
"find": 3,
"value": [r"(Руководитель).*([кд]\..+\.н\., (доцент|профессор))[|]*([А-Я](?:\.-?[А-Я])?\.[А-Я]\. [А-Я][а-я]+)"], #
"value": [
r"(Руководитель).*([кд]\..+\.н\., (доцент|профессор))[|]*([А-Я](?:\.-?[А-Я])?\.[А-Я]\. [А-Я][а-я]+)"
], #
"logs": "",
},
{
Expand Down
13 changes: 8 additions & 5 deletions app/utils/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,14 @@ def run_process(cmd: str):

def convert_to(filepath, target_format='pdf'):
new_filename, outdir = None, dirname(filepath)
convert_cmd = "timeout 3m " + {
'pdf': f"soffice --headless --convert-to pdf --outdir {outdir} {filepath}",
'docx': f"soffice --headless --convert-to docx --outdir {outdir} {filepath}",
'pptx': f"soffice --headless --convert-to pptx --outdir {outdir} {filepath}",
}[target_format]
convert_cmd = (
"timeout 3m "
+ {
'pdf': f"soffice --headless --convert-to pdf --outdir {outdir} {filepath}",
'docx': f"soffice --headless --convert-to docx --outdir {outdir} {filepath}",
'pptx': f"soffice --headless --convert-to pptx --outdir {outdir} {filepath}",
}[target_format]
)

if run_process(convert_cmd).returncode == 0:
# success conversion
Expand Down
Loading