def get_client():
    """Return the module-level ``MongoClient`` instance."""
    return client


def get_image(image_id):
    """Fetch one stored image by ``_id``.

    Returns an ``Image`` built from the document, or ``None`` when nothing matches.
    ``image_id`` is used in the filter exactly as given (no type coercion).
    """
    document = images_collection.find_one({'_id': image_id})
    return Image(document) if document is not None else None


def get_images_by_check_id(check_id):
    """Return all images stored for ``check_id`` as a list of ``Image`` objects.

    The id is converted with ``str()`` before querying. An empty list is returned
    when no image matches.
    """
    # Collection.find() always yields a cursor (never None), so the former
    # "is not None" guard and its "return None" branch could not be reached.
    return [Image(document) for document in images_collection.find({'check_id': str(check_id)})]


def save_image_to_db(check_id, image_data, caption, image_size, document_id=None, text=None, page=None,
                     checksum=None, text_density=None, symbols_percentage=None):
    """Insert a new image document and return its ``inserted_id``.

    ``checksum`` falls back to ``calculate_image_checksum(image_data)`` when it is falsy.
    """
    image = Image({
        # Stored as str so that it matches the str(check_id) filter used by
        # get_images_by_check_id(); a raw ObjectId here would never be found again.
        'check_id': str(check_id),
        'document_id': document_id,
        'image_data': image_data,
        'caption': caption,
        'image_size': image_size,
        'text': text,
        'page': page,
        'checksum': checksum or calculate_image_checksum(image_data),
        'text_density': text_density,
        'symbols_percentage': symbols_percentage
    })
    result = images_collection.insert_one(image.pack())
    return result.inserted_id


def update_image(image):
    """Replace the stored document whose ``_id`` equals ``image._id``.

    Returns ``True`` when a document was found and replaced, ``False`` otherwise.
    """
    return bool(images_collection.find_one_and_replace({'_id': image._id}, image.pack()))


def calculate_image_checksum(image_bytes):
    """Return the SHA-256 hex digest of ``image_bytes``, or ``None`` for empty/``None`` input."""
    return hashlib.sha256(image_bytes).hexdigest() if image_bytes else None


def is_checksum_in_db(checksum):
    """Tell whether any stored image carries ``checksum``; falsy checksums are never looked up."""
    if not checksum:
        return False
    return images_collection.find_one({"checksum": checksum}) is not None


def update_check(check):
    """Replace the stored check with ``check.pack()``; ``True`` when a document was replaced."""
    return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))


def add_parsed_text(check_id, parsed_text):
    """Upsert ``parsed_text`` by filename, link it from ``files_info`` and return its ``_id``.

    The document is matched on ``parsed_text.filename``; an existing document is
    updated in place, otherwise a new one is inserted.
    """
    document = dict(parsed_text.pack())
    # MongoDB refuses to modify _id on an existing document, so if pack() carries
    # one it may only be applied when the upsert actually inserts.
    # NOTE(review): pack() is not visible here - presumably it includes '_id'; confirm.
    document_id = document.pop('_id', None)
    update = {'$set': document}
    if document_id is not None:
        update['$setOnInsert'] = {'_id': document_id}
    # One atomic round trip instead of update_one() followed by find_one().
    stored = parsed_texts_collection.find_one_and_update(
        {'filename': parsed_text.filename},
        update,
        projection={'_id': True},
        upsert=True,
        return_document=pymongo.ReturnDocument.AFTER)
    parsed_texts_id = stored['_id']
    # NOTE(review): files_info is filtered by check_id here - verify that callers
    # pass an id that really is a files_info '_id'.
    files_info_collection.update_one({'_id': check_id}, {"$push": {'parsed_texts': parsed_texts_id}})
    return parsed_texts_id


def get_celery_task_by_check(check_id):
    """Return the ``celery_check`` mapping document for ``check_id`` (or ``None``)."""
    return celery_check_collection.find_one({'check_id': check_id})


def get_celery_task_status_by_check(check_id):
    """Return ``True`` when the main celery task of ``check_id`` has a ``finished_at`` stamp."""
    celery_task = get_celery_task_by_check(check_id)
    return bool(celery_task) and 'finished_at' in celery_task


def add_celery_tesseract_task(celery_tesseract_task_id, check_id):
    """Record a started tesseract task for ``check_id`` and return the inserted id."""
    return celery_tesseract_collection.insert_one({
        'celery_tesseract_task_id': celery_tesseract_task_id,
        'check_id': check_id,
        'started_at': datetime.now(),
    }).inserted_id


def get_celery_tesseract_task_status_by_check(check_id):
    """Return ``True`` when the tesseract task of ``check_id`` has a ``finished_at`` stamp."""
    celery_tesseract_task = get_celery_tesseract_task_by_check(check_id)
    return bool(celery_tesseract_task) and 'finished_at' in celery_tesseract_task


def mark_celery_tesseract_task_as_finished_by_check(check_id, tesseract_result, finished_time=None):
    """Store the tesseract result, finish time and processing time for ``check_id``.

    Returns ``None`` when no tesseract task is recorded for the check, otherwise
    the result of ``update_one``. ``finished_time`` defaults to ``datetime.now()``.
    """
    celery_tesseract_task = get_celery_tesseract_task_by_check(check_id)
    if not celery_tesseract_task:
        return None
    if finished_time is None:
        finished_time = datetime.now()
    elapsed = (finished_time - celery_tesseract_task['started_at']).total_seconds()
    return celery_tesseract_collection.update_one({'check_id': check_id}, {
        '$set': {'finished_at': finished_time,
                 'tesseract_result': tesseract_result,
                 'processing_time': elapsed}})


def get_celery_tesseract_task(celery_tesseract_task_id):
    """Return the tesseract task document with the given celery task id (or ``None``)."""
    return celery_tesseract_collection.find_one({'celery_tesseract_task_id': celery_tesseract_task_id})


def get_celery_tesseract_task_by_check(check_id):
    """Return the tesseract task document recorded for ``check_id`` (or ``None``)."""
    return celery_tesseract_collection.find_one({'check_id': check_id})
class Image:
    """Picture extracted from an uploaded document, mirrored to and from a plain dict.

    Every field is read from ``dictionary`` with ``dict.get``: missing keys become
    ``None``, except ``caption`` (empty string) and ``_id`` (a new ``ObjectId``).
    """

    def __init__(self, dictionary=None):
        dictionary = dictionary or {}
        # Create a fresh id only when none was supplied, instead of building a
        # throw-away ObjectId on every construction.
        self._id: ObjectId = dictionary['_id'] if '_id' in dictionary else ObjectId()
        if isinstance(self._id, str):
            self._id = ObjectId(self._id)

        self.check_id: str = dictionary.get('check_id')
        self.document_id: str = dictionary.get('document_id')
        self.caption: str = dictionary.get('caption', '')
        self.image_data: bytes = dictionary.get('image_data')
        # (width, height); the docx extractor stores centimetres as floats and
        # may store None for either component when the drawing has no extent.
        self.image_size: tuple[float, float] = dictionary.get('image_size')
        self.text: str = dictionary.get('text')
        self.page: int = dictionary.get('page')
        self.checksum: str = dictionary.get('checksum')
        self.text_density: float = dictionary.get('text_density')
        self.symbols_percentage: float = dictionary.get('symbols_percentage')

    def pack(self):
        """Return all fields, including ``_id``, as a dict."""
        return {
            "_id": self._id,
            "check_id": self.check_id,
            "document_id": self.document_id,
            "caption": self.caption,
            "image_data": self.image_data,
            "image_size": self.image_size,
            "text": self.text,
            "page": self.page,
            "checksum": self.checksum,
            "text_density": self.text_density,
            "symbols_percentage": self.symbols_percentage
        }


class ParsedText(PackableWithId):
    """Parsed chapters of one uploaded file, identified by ``filename``."""

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        dictionary = dictionary or {}
        self.filename = dictionary.get('filename', '')
        self.parsed_chapters = dictionary.get('parsed_chapters', [])
class ImageQualityCheck(BaseReportCriterion):
    """Report criterion that flags blurry or low-information images.

    Each image is decoded with OpenCV and scored by the variance of its
    Laplacian and by the entropy of its grey-level histogram; scores below the
    configured minimums are reported.
    """
    label = "Проверка качества изображений"
    _description = ''
    id = 'image_quality_check'

    # TODO: min_laplacian and min_entropy still have to be tuned.
    def __init__(self, file_info, min_laplacian=100, min_entropy=1):
        super().__init__(file_info)
        self.images = self.file.images
        self.min_laplacian = min_laplacian
        self.min_entropy = min_entropy
        self.laplacian_score = None
        self.entropy_score = None

    def check(self):
        """Score every image and return the criterion answer."""
        # NOTE(review): the "\n" separators below stand for raw line breaks that
        # appear inside these literals in the reviewed text - confirm the exact
        # separator (it may be an HTML line break) before merging.
        if not self.images:
            return answer(True, 'Изображения не найдены!')

        deny_list = []
        for img in self.images:
            img_cv = None
            # Missing bytes cannot be decoded; report them like any other
            # unreadable image instead of crashing in np.frombuffer().
            if img.image_data:
                image_array = np.frombuffer(img.image_data, dtype=np.uint8)
                img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)

            if img_cv is None:
                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.\n")
                continue

            self.find_params(img_cv)

            if self.laplacian_score is None or self.entropy_score is None:
                deny_list.append(f"Изображение с подписью '{img.caption}' не может быть обработано.\n")
                continue

            if self.laplacian_score < self.min_laplacian:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкий показатель лапласиана: {self.laplacian_score:.2f} (минимум {self.min_laplacian:.2f}).\n")

            if self.entropy_score < self.min_entropy:
                deny_list.append(f"Изображение с подписью '{img.caption}' имеет низкую энтропию: {self.entropy_score:.2f} (минимум {self.min_entropy:.2f}).\n")

        if deny_list:
            return answer(False, f'Изображения нечитаемы!\nПопробуйте улучшить качество изображений, возможно они слишком размыты или зашумлены.\n{"".join(deny_list)}')
        return answer(True, 'Изображения корректны!')

    def find_params(self, image):
        """Compute and store the Laplacian variance and histogram entropy of ``image``.

        Both scores are stored on ``self`` and also returned as a
        ``(laplacian_score, entropy_score)`` tuple; both are ``None`` when the
        image is ``None`` or empty.
        """
        # Reset first: otherwise an image that cannot be measured would be judged
        # by the scores left over from the previous image.
        self.laplacian_score = None
        self.entropy_score = None
        if image is None or image.size == 0:
            return None, None
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        self.laplacian_score = cv2.Laplacian(gray_image, cv2.CV_64F).var()
        hist, _ = np.histogram(gray_image.ravel(), bins=256, range=[0, 256])
        hist = hist / hist.sum()
        # The small epsilon keeps log2 finite for empty histogram bins.
        self.entropy_score = -np.sum(hist * np.log2(hist + 1e-10))
        return self.laplacian_score, self.entropy_score


class ImageTextCheck(BaseReportCriterion):
    """Report criterion that hands image OCR over to the tesseract celery queue.

    ``check`` only schedules the work and answers immediately; the verdict is
    produced later by the tesseract task.
    """
    label = "Проверка текста, считанного с изображений"
    _description = ''
    id = 'image_text_check'

    # TODO: tune symbols_set, max_symbols_percentage and max_text_density.
    def __init__(self, file_info, symbols_set=list("@#$%^&*~`‘|±§№¤¢£€{¥}©®™•¶÷×"), max_symbols_percentage=5, max_text_density=4):
        super().__init__(file_info)
        self.images = self.file.images
        # Copy so that instances never share (or mutate) the default list object.
        self.symbols_set = list(symbols_set)
        self.max_symbols_percentage = max_symbols_percentage
        self.max_text_density = max_text_density

    def check(self):
        """Schedule OCR for the check's images and return a provisional answer."""
        # Imported lazily, as in the rest of the project, at call time.
        from app.tesseract_tasks import tesseract_recognize, callback_task
        from db.db_methods import add_celery_tesseract_task

        if not self.images:
            return answer(True, 'Изображения не найдены!')

        check_id = self.images[0].check_id
        tesseract_task = tesseract_recognize.apply_async(
            args=[check_id, self.symbols_set, self.max_symbols_percentage, self.max_text_density],
            link=callback_task.s(check_id),
            link_error=callback_task.s(check_id)
        )
        add_celery_tesseract_task(tesseract_task.id, check_id)
        return answer(True, 'Изображения проверяются!')
def extract_images_with_captions(self, check_id):
    """Store every embedded picture of the presentation together with a caption.

    For each slide the first non-empty text frame is taken as the caption of
    all pictures on that slide. Slides without any text are skipped, so their
    pictures are not stored. Picture sizes are saved in centimetres.
    """
    from app.db.db_methods import save_image_to_db

    emu_per_cm = 360000  # same EMU-to-centimetre factor the docx extractor uses

    def to_cm(emu):
        return None if emu is None else emu / emu_per_cm

    # Walk every slide of the presentation.
    for slide in self.slides:
        shapes = list(slide.slide.shapes)  # slide.slide is the underlying pptx slide

        # The first non-empty text frame is assumed to be the caption.
        caption_text = None
        for shape in shapes:
            if not shape.has_text_frame:
                continue
            text = shape.text.strip()
            if text:
                caption_text = text
                break
        if caption_text is None:
            continue

        # Save each picture; previously only the last picture of a slide was
        # kept and the call lacked the required image_size argument.
        for shape in shapes:
            if shape.shape_type != MSO_SHAPE_TYPE.PICTURE:
                continue
            try:
                image_bytes = shape.image.blob
            except ValueError:
                # python-pptx raises ValueError for a picture that has no
                # embedded image (e.g. a linked one); nothing to store.
                continue
            save_image_to_db(check_id, image_bytes, caption_text,
                             (to_cm(shape.width), to_cm(shape.height)))
def extract_images_with_captions(self, check_id):
    """Extract embedded images with their captions, store them and fill ``self.images``.

    A caption is the first following paragraph that is non-empty, uses the
    caption style and contains 'Рисунок'; the search stops at the next paragraph
    that holds a graphic. Images without such a paragraph get the placeholder
    caption. Does nothing when ``self.images`` is already populated.
    """
    from app.db.db_methods import save_image_to_db, get_images_by_check_id

    # NOTE(review): the reviewed text has lost its indentation; the final
    # self.images assignment is treated as part of the "not self.images" guard.
    if self.images:
        return

    emu_to_cm = 360000
    image_style = "ВКР_Подпись для рисунков".lower()
    blip_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    extent_ns = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
    embed_attr = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

    # Read the paragraph list once and find the graphic-bearing runs once;
    # re-reading self.file.paragraphs and re-serialising run XML inside the
    # caption search made the scan quadratic.
    paragraphs = self.file.paragraphs
    graphic_runs = [[run for run in paragraph.runs if "graphic" in run._element.xml]
                    for paragraph in paragraphs]

    for index, runs in enumerate(graphic_runs):
        # Collect every embedded image of the paragraph; previously only the
        # last one was kept.
        found = []
        for run in runs:
            extent = run._element.find('.//wp:extent', namespaces=extent_ns)
            width_cm = height_cm = None
            if extent is not None:
                width_cm = int(extent.get('cx')) / emu_to_cm
                height_cm = int(extent.get('cy')) / emu_to_cm
            for blip in run._element.findall('.//a:blip', namespaces=blip_ns):
                embed_id = blip.get(embed_attr)
                if embed_id:
                    found.append((self.file.part.related_parts[embed_id].blob, (width_cm, height_cm)))
        if not found:
            continue

        caption = "picture without caption"
        for later in range(index + 1, len(paragraphs)):
            if graphic_runs[later]:
                break  # the next picture starts here, so this one has no caption
            candidate = paragraphs[later]
            candidate_text = candidate.text.strip()
            if candidate_text and candidate.style.name.lower() == image_style and 'Рисунок' in candidate_text:
                caption = candidate_text
                break

        for image_data, image_size in found:
            save_image_to_db(check_id, image_data, caption, image_size)

    self.images = get_images_by_check_id(check_id)
def parse_headers_and_pages_and_images(chapters, docx):
    """Fill in chapter start pages, image pages and per-chapter image ids.

    Pages whose text contains 'СОДЕРЖАНИЕ' are ignored. When a header or a
    caption occurs on several pages, the last such page wins. Images whose page
    was found are written back through ``db_methods.update_image``. ``chapters``
    is modified in place and returned.
    """
    text_on_page = docx.pdf_file.get_text_on_page()
    images = docx.images or []
    located = {}  # id(image) -> image, for images whose page was found

    for page, text in text_on_page.items():
        text = re.sub(r"(-\n)", "", text)   # re-join words hyphenated across lines
        text = re.sub(r"\s\n", " ", text)
        if "СОДЕРЖАНИЕ" in text:
            continue
        for chapter in chapters:
            if chapter["header"] in text:
                chapter["start_page"] = page
        for image in images:
            # An empty caption is a substring of every page and None cannot be
            # searched for at all, so images without a caption are skipped.
            if image.caption and image.caption in text:
                image.page = page
                located[id(image)] = image

    # One write per image instead of one per matching page; the stored page is
    # the same (the last match).
    for image in located.values():
        db_methods.update_image(image)

    for chapter in chapters:
        for image in images:
            if image.caption and image.caption in chapter["text"]:
                chapter["images"].append(image._id)
    return chapters


def parse_chapters(docx):
    """Build chapter records from ``docx.chapters``.

    Only chapters that have children and whose style contains 'heading' are
    kept. Headers containing 'ПРИЛОЖЕНИЕ' are cut at the first dot. The chapter
    text is the concatenation of its children's texts.
    """
    chapters = []
    for chapter in docx.chapters:
        head = chapter["styled_text"]["text"]
        if "ПРИЛОЖЕНИЕ" in head:
            head = head.split(".")[0]
        if chapter["child"] and "heading" in chapter["style"]:
            text = "".join(child["styled_text"]["text"] for child in chapter["child"])
            chapters.append({"header": head, "start_page": 0, "text": text, "images": []})
    return chapters
import db_methods -from db.db_types import Check +from db.db_types import Check, ParsedText from main.checker import check from main.parser import parse from root_logger import get_root_logger +from tesseract_tasks import update_tesseract_criteria_result config = ConfigParser() config.read('app/config.ini') @@ -51,9 +53,18 @@ def create_task(self, check_info): original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}") pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf") try: - updated_check = check(parse(original_filepath, pdf_filepath), check_obj) - updated_check.is_ended = True + parsed_file_object = parse(original_filepath, pdf_filepath, check_id) + parsed_file_object.make_chapters(check_obj.file_type['report_type']) + parsed_file_object.make_headers(check_obj.file_type['report_type']) + chapters = parse_file.parse_chapters(parsed_file_object) + + updated_check = check(parsed_file_object, check_obj) updated_check.is_failed = False + parsed_text = ParsedText(dict(filename=check_info['filename'])) + parsed_text.parsed_chapters = parse_file.parse_headers_and_pages_and_images(chapters, parsed_file_object) + db_methods.add_parsed_text(check_id, parsed_text) + if db_methods.get_celery_tesseract_task_status_by_check(check_id): + update_tesseract_criteria_result(updated_check) db_methods.update_check(updated_check) # save to db db_methods.mark_celery_task_as_finished(self.request.id) diff --git a/app/tesseract_tasks.py b/app/tesseract_tasks.py new file mode 100644 index 00000000..10e84cdf --- /dev/null +++ b/app/tesseract_tasks.py @@ -0,0 +1,153 @@ +import os +import time +from celery import Celery +from celery.exceptions import SoftTimeLimitExceeded, MaxRetriesExceededError +import pytesseract +import cv2 +import numpy as np +from root_logger import get_root_logger +from db import db_methods +import re +from bson import ObjectId +from main.check_packs.pack_config import BASE_REPORT_CRITERION + +TASK_RETRY_COUNTDOWN = 30 
def _retry_or_report(task, check_id, failure_verdict):
    """Ask celery to retry ``task``; once retries are exhausted store a failed result."""
    try:
        task.retry(countdown=TASK_RETRY_COUNTDOWN)
    except MaxRetriesExceededError:
        logger.error(f"Достигнут лимит повторных попыток для check_id: {check_id}")
        add_tesseract_result(check_id, [[failure_verdict], 0])


@celery.task(name="tesseract_recognize", queue='tesseract-queue', bind=True, max_retries=MAX_RETRIES, soft_time_limit=TASK_SOFT_TIME_LIMIT)
def tesseract_recognize(self, check_id, symbols_set, max_symbols_percentage, max_text_density):
    """Run OCR on every image of ``check_id`` and then evaluate the recognised text.

    Images that already carry text are not recognised again. Any failure,
    including a soft time limit, triggers a retry; when retries are exhausted a
    failed verdict is stored instead.
    """
    try:
        for image in db_methods.get_images_by_check_id(check_id) or []:
            image_array = np.frombuffer(image.image_data, dtype=np.uint8)
            img_cv = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
            if img_cv is None:
                raise ValueError(f"Не удалось декодировать изображение с подписью '{image.caption}' из двоичных данных")
            text = image.text
            if not text:
                text = pytesseract.image_to_string(img_cv, **TESSERACT_CONFIG)
                if text.strip():
                    logger.info(f"Текст успешно распознан для изображения с подписью '{image.caption}'")
                else:
                    logger.info(f"Текст для изображения с подписью '{image.caption}' пустой.")
            try:
                image.text = (re.sub(r'\s+', ' ', text)).strip()
                db_methods.update_image(image)
            except Exception as e:
                raise ValueError(f"Ошибка при сохранении текста для изображения с подписью '{image.caption}': {e}") from e
        # Evaluated even when no image was found, so that the tesseract task
        # always ends with a stored result.
        try:
            update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density)
        except Exception as e:
            raise ValueError(f"Ошибка во время проверки текста: {e}") from e
    except SoftTimeLimitExceeded:
        logger.warning(f"Превышен мягкий лимит времени для check_id: {check_id}. Задача будет перезапущена.")
        _retry_or_report(self, check_id, "Превышен лимит времени и попыток")
    except Exception as e:
        logger.error(f"Ошибка при распознавании текста для check_id: {check_id}: {e}", exc_info=True)
        _retry_or_report(self, check_id, f"Ошибка при распознавании текста: {e}")


@celery.task(name="callback_task", queue='callback-queue', soft_time_limit=SOFT_TIME_LIMIT_FOR_CALLBACK)
def callback_task(result, check_id):
    """Apply the tesseract verdict to the check if the main task finished without it.

    Linked to ``tesseract_recognize`` both as success and as error callback.
    Errors are logged, never raised.
    """
    try:
        # NOTE(review): the fixed pause presumably lets the main task store its
        # result first - confirm the intent.
        time.sleep(10)
        check = db_methods.get_check(ObjectId(check_id))
        if not db_methods.get_celery_task_status_by_check(ObjectId(check_id)):
            logger.info(f"Задачи create_task и tesseract_recognize для check_id: {check_id} обрабатываются корректно. Состояние гонки исключено.")
            return
        if check.is_ended:
            logger.info(f"Проверка успешно завершена для check_id: {check_id}")
            return
        update_tesseract_criteria_result(check)
        db_methods.update_check(check)
        logger.info(f"Проверка успешно обновлена для check_id: {check_id}")
    except SoftTimeLimitExceeded:
        logger.warning(f"Превышен мягкий лимит времени для callback_task с check_id: {check_id}.")
    except Exception as e:
        logger.error(f"Ошибка в callback_task для check_id: {check_id}: {e}")


def update_ImageTextCheck(check_id, symbols_set, max_symbols_percentage, max_text_density):
    """Score the recognised text of every image and store the overall verdict.

    For each image with text, the text density and the share of characters from
    ``symbols_set`` are saved on the image and compared with the limits. The
    verdict is passed to ``add_tesseract_result`` as ``[[message], score]``.
    """
    # NOTE(review): the "\n" separators below stand for raw line breaks that
    # appear inside these literals in the reviewed text - confirm the exact
    # separator (it may be an HTML line break) before merging.
    images = db_methods.get_images_by_check_id(check_id) or []
    symbols = frozenset(symbols_set)  # built once, O(1) lookups per character
    deny_list = []
    for image in images:
        if not image.text:
            continue
        # The size may be missing, or hold None components when the source
        # drawing had no extent; an unknown area is treated as 0 (density 0)
        # instead of failing on None * None.
        width, height = image.image_size or (None, None)
        image_area = width * height if width and height else 0
        text_density = calculate_text_density(image.text, image_area)
        image.text_density = text_density
        if text_density > max_text_density:
            deny_list.append(
                f"Изображение с подписью '{image.caption}' имеет слишком высокую плотность текста: "
                f"{text_density:.2f} (максимум {max_text_density:.2f}). Это может означать, что текст нечитаем.\n"
            )
        symbols_count = count_symbols_in_text(image.text, symbols)
        text_length = len(image.text)
        symbols_percentage = (symbols_count / text_length) * 100
        image.symbols_percentage = symbols_percentage
        if symbols_percentage > max_symbols_percentage:
            deny_list.append(
                f"На изображении с подписью '{image.caption}' содержится слишком много неверно распознанных символов: "
                f"{symbols_percentage:.2f}% (максимум {max_symbols_percentage:.2f}%). Это может означать, что размер шрифта слишком маленький или текст нечитаем.\n"
            )
        db_methods.update_image(image)
    if deny_list:
        result = [[f'Проблемы с текстом на изображениях!\n{"".join(deny_list)}'], 0]
    else:
        result = [['Текст на изображениях корректен!'], 1]
    add_tesseract_result(check_id, result)


def add_tesseract_result(check_id, result):
    """Mark the tesseract task finished and, if the main task is done, update the check."""
    updated_check = db_methods.get_check(ObjectId(check_id))
    db_methods.mark_celery_tesseract_task_as_finished_by_check(check_id, result)
    # NOTE(review): callback_task queries this status with ObjectId(check_id),
    # here the raw id is passed. Left unchanged on purpose - verify which id
    # type the collection stores before aligning the two call sites.
    if db_methods.get_celery_task_status_by_check(check_id):
        update_tesseract_criteria_result(updated_check)
        db_methods.update_check(updated_check)


def update_tesseract_criteria_result(check):
    """Copy the stored tesseract verdict into the check's 'image_text_check' criterion.

    Also lowers ``check.score`` by the lost share of one criterion and marks
    the check as ended. Mutates ``check``; saving it is up to the caller.
    """
    tesseract_task = db_methods.get_celery_tesseract_task_by_check(str(check._id))
    verdict, score = tesseract_task['tesseract_result'][0], tesseract_task['tesseract_result'][1]
    for criteria in check.enabled_checks:
        if criteria["id"] == 'image_text_check':
            criteria["verdict"] = verdict
            criteria["score"] = score
            check.score = max(0, round(check.score - (1 - score) / len(BASE_REPORT_CRITERION), 3))
            break
    # NOTE(review): indentation is lost in the reviewed text; is_ended is set
    # here whether or not the criterion was found - confirm.
    check.is_ended = True


def count_symbols_in_text(text, symbols_set):
    """Count the characters of ``text`` that belong to ``symbols_set``."""
    # A set gives O(1) membership tests; reuse it when the caller already has one.
    symbols = symbols_set if isinstance(symbols_set, (set, frozenset)) else frozenset(symbols_set)
    return sum(1 for char in text if char in symbols)


def calculate_text_density(text, image_area):
    """Return non-whitespace characters per unit of ``image_area`` (0 for a missing or zero area)."""
    if not image_area:
        return 0
    text_without_spaces = ''.join(text.split())
    return len(text_without_spaces) / image_area
${WORKER_MEMORY:-1G} volumes: flower_data: diff --git a/requirements.txt b/requirements.txt index 5bf13795..3dfb2fc4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -32,4 +32,6 @@ redis==6.1.0 requests~=2.31.0 scipy~=1.11.1 urllib3~=2.0.3 -werkzeug==2.0.0 \ No newline at end of file +werkzeug==2.0.0 +opencv-python==4.5.5.64 +pytesseract==0.3.10 \ No newline at end of file