diff --git a/app/configs/config_abbreviations.json b/app/configs/config_abbreviations.json
new file mode 100644
index 00000000..81f7e306
--- /dev/null
+++ b/app/configs/config_abbreviations.json
@@ -0,0 +1,25 @@
+{
+ "common_abbr": [
+ "СССР", "РФ", "США", "ВКР", "ИТ", "ПО", "ООО", "ЗАО", "ОАО", "HTML", "CSS",
+ "JS", "ЛЭТИ", "МОЕВМ", "ЭВМ", "ГОСТ", "DVD", "ИИ", "ОБЗОР",
+ "ООП", "ЛР", "КР", "ОТЧЕТ", "ПЛАН", "СЛОВА", "ЦПУ", "МБ", "ОЗУ", "КБ",
+ "SSD", "PC", "HDD",
+ "AX", "BX", "CX", "DX", "SI", "DI", "BP", "SP",
+ "AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
+ "CS", "DS", "ES", "SS", "FS", "GS",
+ "IP", "EIP", "RIP", "URL",
+ "CF", "PF", "AF", "ZF", "SF", "TF", "IF", "DF", "OF",
+ "EAX", "EBX", "ECX", "EDX", "ESI", "EDI", "EBP", "ESP",
+ "RAX", "RBX", "RCX", "RDX", "RSI", "RDI", "RBP", "RSP",
+ "DOS", "OS", "BIOS", "UEFI", "MBR", "GPT",
+ "ASCII", "UTF", "UNICODE", "ANSI",
+ "ЭВМ", "МОЭВМ",
+ "CPU", "GPU", "APU", "RAM", "ROM", "PROM", "EPROM", "EEPROM",
+ "USB", "SATA", "PCI", "PCIe", "AGP", "ISA", "VGA", "HDMI", "DP",
+ "LAN", "WAN", "WLAN", "VPN", "ISP", "DNS", "DHCP", "TCP", "UDP", "IP",
+ "HTTP", "HTTPS", "FTP", "SSH", "SSL", "TLS", "XP", "ELF", "ACM", "IEEE", "UX",
+ "API", "GUI", "CLI", "IDE", "SDK", "SQL", "NoSQL", "XML", "JSON", "YAML",
+ "MAC", "IBM", "CERF", "LTR", "RTL", "FPS", "SHA", "AR", "EN", "RU",
+ "CREAT", "FIFO", "RSS", "UML", "UI", "GB", "IJGBL"
+ ]
+}
\ No newline at end of file
diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py
index d31bc3c1..696b9188 100644
--- a/app/main/check_packs/pack_config.py
+++ b/app/main/check_packs/pack_config.py
@@ -22,6 +22,7 @@
['pres_image_capture'],
['task_tracker'],
['overview_in_tasks'],
+ ['pres_abbreviations_check'],
['pres_aspect_ratio_check'],
['pres_was_were_check'],
]
@@ -53,6 +54,7 @@
["empty_task_page_check"],
["water_in_the_text_check"],
["report_task_tracker"],
+ ["report_abbreviations_check"],
["report_was_were_check"],
]
diff --git a/app/main/checks/check_abbreviations.py b/app/main/checks/check_abbreviations.py
new file mode 100644
index 00000000..caa6d117
--- /dev/null
+++ b/app/main/checks/check_abbreviations.py
@@ -0,0 +1,137 @@
+import json
+import re
+from pathlib import Path
+
+from pymorphy3 import MorphAnalyzer
+
+morph = MorphAnalyzer()
+
+DEBUG_MODE = False
+
+
def load_abbreviations(config_path=None):
    """Load the set of well-known abbreviations that need no explanation.

    Args:
        config_path: Optional path to a JSON file holding a ``"common_abbr"``
            list. When omitted, ``configs/config_abbreviations.json`` located
            three directory levels above this module is used.

    Returns:
        A ``set`` with the configured abbreviations; an empty set when the
        ``"common_abbr"`` key is missing or null.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if config_path is None:
        config_path = Path(__file__).parent.parent.parent / "configs" / "config_abbreviations.json"
    with open(config_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # A missing/null key previously ended in ``set(None)`` -> TypeError at import time.
    return set(data.get("common_abbr") or [])
+
+
+COMMON_ABBR = load_abbreviations()
+
+
def debug_print(*args, **kwargs):
    """Forward all arguments to ``print`` only while ``DEBUG_MODE`` is truthy."""
    if not DEBUG_MODE:
        return
    print(*args, **kwargs)
+
+
def get_first_letters(phrase):
    """Build an acronym from the upper-cased first character of each word.

    Words are separated by any whitespace. A falsy ``phrase`` (``None`` or an
    empty string) yields an empty string.
    """
    initials = []
    tokens = phrase.split() if phrase else ()
    for token in tokens:
        # ``str.split()`` without arguments never produces empty tokens.
        initials.append(token[0].upper())
    return "".join(initials)
+
+
def is_abbreviation_explained(abbr: str, text: str) -> bool:
    """Tell whether ``text`` contains a matching explanation for ``abbr``.

    The following layouts are recognised (case-insensitively):

    * ``ABBR (explanation)``
    * ``(explanation) ABBR``
    * ``ABBR — explanation`` / ``ABBR - explanation``
    * ``explanation — ABBR`` / ``explanation - ABBR``

    Every occurrence of every layout is examined, and an occurrence counts
    only when ``correctly_explained`` accepts the captured explanation.

    Args:
        abbr: The abbreviation to look for.
        text: The text to search in.

    Returns:
        ``True`` as soon as one occurrence carries an accepted explanation,
        ``False`` otherwise.
    """
    # Escape the abbreviation so it is always matched literally, and anchor it
    # on word boundaries so that it is not found inside a longer word.
    token = rf"\b{re.escape(abbr)}\b"
    patterns = [
        rf"{token}\s*\(([^)]+)\)",  # ABBR (explanation)
        rf"\(([^)]+)\)\s*{token}",  # (explanation) ABBR
        rf"{token}\s*[—\-]\s*([^.,;!?]+)",  # ABBR — explanation / ABBR - explanation
        rf"([^.,;!?]+)\s*[—\-]\s*{token}",  # explanation — ABBR / explanation - ABBR
    ]

    debug_print(f"Проверка аббревиатуры: {abbr}")
    debug_print(f"Текст (первые 200 символов): {text[:200]}...")

    for pattern in patterns:
        # Inspect every match: the first occurrence may be an unrelated
        # parenthesis or dash while a later one holds the real explanation.
        for match in re.finditer(pattern, text, re.IGNORECASE):
            explanation = match.group(1)
            debug_print(f"  Найден паттерн {pattern}")
            debug_print(f"  Расшифровка: {explanation}")

            if correctly_explained(abbr, explanation):
                debug_print("  Расшифровка корректна")
                return True

            debug_print("  Расшифровка НЕ соответствует первым буквам")
            debug_print(f"  Ожидалось: {abbr.upper()}")
            debug_print(f"  Получено: {get_first_letters(explanation)}")

    debug_print(f"  Расшифровка для {abbr} не найдена")
    return False
+
+
def get_unexplained_abbrev(text, unverifiable_text):
    """Collect the abbreviations of ``text`` that lack an accepted explanation.

    Returns:
        ``(False, [])`` when ``find_abbreviations`` reports nothing, otherwise
        ``(True, unexplained)`` where ``unexplained`` lists every abbreviation
        rejected by ``is_abbreviation_explained`` (possibly empty).
    """
    found = find_abbreviations(text, unverifiable_text)
    if found:
        missing = [item for item in found if not is_abbreviation_explained(item, text)]
        return True, missing
    return False, []
+
+
def find_abbreviations(text: str, unverifiable_text: str):
    """Find the abbreviations in ``text`` that have to be explained.

    A candidate is a stand-alone run of 2-5 upper-case Cyrillic or Latin
    letters. A candidate is dropped when it is listed in ``COMMON_ABBR``, when
    it occurs (as a substring) in ``unverifiable_text``, or when the first
    ``morph.parse`` result for its lower-cased form has a zero score.

    Args:
        text: The text to scan.
        unverifiable_text: Text whose abbreviations are exempt from the check;
            ``None`` is treated as an empty string.

    Returns:
        A list of unique abbreviations in order of first appearance in
        ``text``.
    """
    pattern = r"\b[А-ЯA-Z]{2,5}\b"
    exempt_text = unverifiable_text or ""

    # ``dict.fromkeys`` removes duplicates while keeping first-use order, which
    # makes the result deterministic and avoids re-running the morphological
    # analysis for every repeated occurrence.
    candidates = dict.fromkeys(re.findall(pattern, text))

    return [
        abbr
        for abbr in candidates
        if abbr not in COMMON_ABBR and abbr not in exempt_text and morph.parse(abbr.lower())[0].score != 0
    ]
+
+
def correctly_explained(abbr, explan):
    """Check that the initials of ``explan`` spell ``abbr`` (case-insensitive).

    The initial of a word is its first alphanumeric character, so quotes or
    brackets glued to a word are ignored, and tokens without any alphanumeric
    character (for example a stand-alone dash) are skipped.

    Args:
        abbr: The abbreviation, e.g. ``"МВД"``.
        explan: The candidate explanation, e.g.
            ``"Министерство внутренних дел"``; ``None`` is treated as empty.

    Returns:
        ``True`` when the concatenated upper-cased initials equal
        ``abbr.upper()``.
    """
    initials = []
    for word in (explan or "").split():
        first = next((char for char in word if char.isalnum()), None)
        if first is not None:
            initials.append(first.upper())

    return "".join(initials) == abbr.upper()
+
+
def main_check(text: str, unverifiable_text: str):
    """Run the abbreviation check and tell the caller whether to go on.

    Args:
        text: Full text of the document.
        unverifiable_text: Text whose abbreviations are exempt from the check.

    Returns:
        A ``(continue_check, message, unexplained_abbr)`` tuple:

        * ``(False, <reason>, [])`` when there is no text, when no
          abbreviations were found, when all of them are explained, or when an
          unexpected error occurred (``<reason>`` says which);
        * ``(True, "", [abbr, ...])`` when unexplained abbreviations remain and
          the caller should report them.
    """
    # Each outcome returns immediately: previously the later checks overwrote
    # the earlier messages, so "no text" and "no abbreviations" were never
    # reported.
    if not text:
        return False, "Не удалось получить текст", []

    try:
        debug_print(f"unverifiable_text : {unverifiable_text}")
        abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(text=text, unverifiable_text=unverifiable_text)
    except Exception as e:
        # Boundary handler: a failing check is reported as a message instead
        # of propagating to the caller.
        return False, f"Ошибка при проверке аббревиатур: {str(e)}", []

    if not abbr_is_finding:
        return False, "Аббревиатуры не найдены в представленном документе", []

    if not unexplained_abbr:
        return False, "Все аббревиатуры правильно расшифрованы", []

    return True, "", unexplained_abbr
+
+
def forming_response(unexplained_abbr_with_page, format_page_link):
    """Render the report listing every unexplained abbreviation with its page.

    Args:
        unexplained_abbr_with_page: Mapping ``abbreviation -> page number``.
        format_page_link: Callable that receives the list of page numbers and
            returns an indexable collection of rendered links, one per page.

    Returns:
        The report text: a header line, one line per abbreviation and two
        closing hint lines, each terminated by a newline.
    """
    links = format_page_link(list(unexplained_abbr_with_page.values()))

    chunks = ["Найдены нерасшифрованные аббревиатуры при первом использовании:\n"]
    for position, name in enumerate(unexplained_abbr_with_page):
        chunks.append(f"- {name} на {links[position]} странице\n")
    chunks.append("Каждая аббревиатура должна быть расшифрована при первом использовании в тексте.\n")
    chunks.append("Расшифровка должны быть по первыми буквам, например, МВД - Министерство внутренних дел.\n")

    return "".join(chunks)
diff --git a/app/main/checks/presentation_checks/abbreviations_presentation.py b/app/main/checks/presentation_checks/abbreviations_presentation.py
new file mode 100644
index 00000000..19870a8b
--- /dev/null
+++ b/app/main/checks/presentation_checks/abbreviations_presentation.py
@@ -0,0 +1,37 @@
+from ..base_check import BasePresCriterion, answer
+from ..check_abbreviations import forming_response, main_check
+
+
class PresAbbreviationsCheck(BasePresCriterion):
    """Presentation criterion: abbreviations must be explained on first use.

    The text of the first slide is passed to ``main_check`` as text whose
    abbreviations are exempt from the check.
    """

    _description = "Аббревиатуры в тексте должны быть расшифрованы при первом использовании."
    id = "pres_abbreviations_check"
    warning = True

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the check and return the ``answer(...)`` result.

        The answer is positive when ``main_check`` reports nothing to show, and
        negative with a per-slide report when unexplained abbreviations remain
        or when an unexpected error occurs.
        """
        # Local import: this module has no top-level ``re`` import.
        import re

        try:
            slides_text = self.file.get_text_from_slides()
            # An empty presentation has no title slide; avoid IndexError and
            # let ``main_check`` report the missing text.
            title_page = slides_text[0] if slides_text else ""
            full_text = " ".join(slides_text)

            continue_check, res_str, unexplained_abbr = main_check(text=full_text, unverifiable_text=title_page)
            if not continue_check:
                return answer(True, res_str)

            # Whole-word matchers: a plain substring test would also hit slides
            # where the abbreviation is merely a part of a longer word.
            matchers = {abbr: re.compile(rf"\b{re.escape(abbr)}\b") for abbr in unexplained_abbr}

            unexplained_abbr_with_slides = {}

            # NOTE(review): slide numbers start at 0 here - confirm that
            # ``format_page_link`` expects zero-based slide indices.
            for slide_num, slide_text in enumerate(slides_text, 0):
                for abbr, matcher in matchers.items():
                    if abbr not in unexplained_abbr_with_slides and matcher.search(slide_text):
                        unexplained_abbr_with_slides[abbr] = slide_num

            if not unexplained_abbr_with_slides:
                return answer(True, "Все аббревиатуры правильно расшифрованы")

            result_str = forming_response(unexplained_abbr_with_slides, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")
diff --git a/app/main/checks/report_checks/abbreviations_check.py b/app/main/checks/report_checks/abbreviations_check.py
new file mode 100644
index 00000000..eb63bcb0
--- /dev/null
+++ b/app/main/checks/report_checks/abbreviations_check.py
@@ -0,0 +1,89 @@
+from ..base_check import BaseReportCriterion, answer
+from ..check_abbreviations import forming_response, main_check
+
+
class ReportAbbreviationsCheck(BaseReportCriterion):
    """Report criterion: abbreviations must be explained on first use.

    Abbreviations that occur on the first page or inside the sections listed in
    ``check`` (references, appendices, list of abbreviations) are exempt.
    """

    label = "Проверка расшифровки аббревиатур"
    _description = "Аббревиатуры в тексте должны быть расшифрованы при первом использовании."
    id = "report_abbreviations_check"
    warning = True

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the check and return the ``answer(...)`` result.

        The answer is positive when ``main_check`` reports nothing to show, and
        negative with a per-page report when unexplained abbreviations remain
        or when an unexpected error occurs.
        """
        # Local import: this module has no top-level ``re`` import.
        import re

        try:
            text = self._get_document_text()

            headings = [
                "СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ",
                "ПРИЛОЖЕНИЕ",
                "ОПРЕДЕЛЕНИЯ, ОБОЗНАЧЕНИЯ И СОКРАЩЕНИЯ",
            ]
            unverifiable_text = self._get_unverifiable_text(headings)

            continue_check, res_str, unexplained_abbr = main_check(text=text, unverifiable_text=unverifiable_text)
            if not continue_check:
                return answer(True, res_str)

            # Whole-word matchers: a plain substring test would also hit pages
            # where the abbreviation is merely a part of a longer word.
            matchers = {abbr: re.compile(rf"\b{re.escape(abbr)}\b") for abbr in unexplained_abbr}

            unexplained_abbr_with_page = {}

            for page_num in range(1, self.file.page_counter() + 1):
                text_on_page = self.file.pdf_file.text_on_page[page_num]

                for abbr, matcher in matchers.items():
                    if abbr not in unexplained_abbr_with_page and matcher.search(text_on_page):
                        unexplained_abbr_with_page[abbr] = page_num

            if not unexplained_abbr_with_page:
                return answer(True, "Все аббревиатуры правильно расшифрованы")
            result_str = forming_response(unexplained_abbr_with_page, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")

    def _get_document_text(self):
        """Return the document text, or ``None`` when no text source exists.

        Page texts of ``self.file.pdf_file`` are preferred; otherwise the text
        is assembled from ``self.file.paragraphs``.
        """
        if hasattr(self.file, "pdf_file"):
            page_texts = self.file.pdf_file.get_text_on_page()
            return " ".join(page_texts.values())
        elif hasattr(self.file, "paragraphs"):
            text_parts = []
            for paragraph in self.file.paragraphs:
                text = paragraph.to_string()
                # NOTE(review): only the second line of a multi-line paragraph
                # string is kept - presumably the first line is a service
                # header; confirm against ``to_string``.
                if "\n" in text:
                    text = text.split("\n")[1]
                text_parts.append(text)
            return "\n".join(text_parts)
        return None

    def _collect_child_text(self, child_elements, text_parts):
        """Append the text of all nested child elements to ``text_parts``."""
        for child in child_elements:
            if child.get("text"):
                text_parts.append(child["text"])
            if child.get("child"):
                self._collect_child_text(child["child"], text_parts)

    def _get_text_into_sections(self, headings):
        """Return the text of every chapter whose title contains a heading.

        The comparison is case-insensitive. The chapter title and the text of
        all its nested children are joined with single spaces.
        """
        chapters = self.file.make_chapters(self.file_type["report_type"])
        wanted = [heading.upper() for heading in headings]
        text_parts = []

        for chapter in chapters:
            chapter_title = chapter.get("text", "").upper()

            if any(stop in chapter_title for stop in wanted):
                text_parts.append(chapter["text"])

                if chapter.get("child"):
                    self._collect_child_text(chapter["child"], text_parts)

        return " ".join(text_parts)

    def _get_text_title_page(self):
        """Return the text of the first PDF page (the title page)."""
        title_page = self.file.pdf_file.text_on_page[1]
        return title_page

    def _get_unverifiable_text(self, unverifiable_headings):
        """Return the text whose abbreviations are exempt from the check."""
        # Keep a separator so that the last word of the title page is not glued
        # to the first word of the section text.
        unverifiable_text = self._get_text_title_page() + " " + self._get_text_into_sections(unverifiable_headings)
        return unverifiable_text
diff --git a/app/main/checks/report_checks/literature_references.py b/app/main/checks/report_checks/literature_references.py
index ba550d3d..8d95bd59 100644
--- a/app/main/checks/report_checks/literature_references.py
+++ b/app/main/checks/report_checks/literature_references.py
@@ -156,7 +156,7 @@ def search_references(self, start_par):
match = re.search(r'Таблица ([.\d]+)', paragraph_text)
table_text = ''
if match:
- index_table += 1 # int(match.group(1)) - 1 # TODO: fix logic
+ index_table += 1 # int(match.group(1)) - 1 # TODO: fix logic
table_text = self.get_text_in_table(index_table)
paragraph_text += table_text
diff --git a/app/main/checks/report_checks/main_page_settings.py b/app/main/checks/report_checks/main_page_settings.py
index e0ed9f06..bd50dab0 100644
--- a/app/main/checks/report_checks/main_page_settings.py
+++ b/app/main/checks/report_checks/main_page_settings.py
@@ -60,7 +60,9 @@ class ReportMainPageSetting:
"found_value": 0,
"found_key": 0,
"find": 3,
- "value": [r"(Руководитель).*([кд]\..+\.н\., (доцент|профессор))[|]*([А-Я](?:\.-?[А-Я])?\.[А-Я]\. [А-Я][а-я]+)"], #
+ "value": [
+ r"(Руководитель).*([кд]\..+\.н\., (доцент|профессор))[|]*([А-Я](?:\.-?[А-Я])?\.[А-Я]\. [А-Я][а-я]+)"
+ ], #
"logs": "",
},
{
diff --git a/app/utils/converter.py b/app/utils/converter.py
index 821cc976..2c051eba 100644
--- a/app/utils/converter.py
+++ b/app/utils/converter.py
@@ -9,11 +9,14 @@ def run_process(cmd: str):
def convert_to(filepath, target_format='pdf'):
new_filename, outdir = None, dirname(filepath)
- convert_cmd = "timeout 3m " + {
- 'pdf': f"soffice --headless --convert-to pdf --outdir {outdir} {filepath}",
- 'docx': f"soffice --headless --convert-to docx --outdir {outdir} {filepath}",
- 'pptx': f"soffice --headless --convert-to pptx --outdir {outdir} {filepath}",
- }[target_format]
+ convert_cmd = (
+ "timeout 3m "
+ + {
+ 'pdf': f"soffice --headless --convert-to pdf --outdir {outdir} {filepath}",
+ 'docx': f"soffice --headless --convert-to docx --outdir {outdir} {filepath}",
+ 'pptx': f"soffice --headless --convert-to pptx --outdir {outdir} {filepath}",
+ }[target_format]
+ )
if run_process(convert_cmd).returncode == 0:
# success conversion