init

vw-document-ai-indexer/utils.py (new file, 334 lines)
@@ -0,0 +1,334 @@
import shutil
from dataclasses import fields
import json
import os
import logging
from datetime import datetime
from decimal import Decimal
import random
from typing import Any, List, Optional, Union
import string
from PIL import Image
import tiktoken
from PIL.Image import Resampling

from entity_models import Document, FigureFlat


class TokenEstimator(object):
    GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")

    def estimate_tokens(self, text: str) -> int:
        return len(self.GPT2_TOKENIZER.encode(text, allowed_special="all"))

    def construct_tokens_with_size(self, tokens: str, numofTokens: int) -> str:
        newTokens = self.GPT2_TOKENIZER.decode(
            self.GPT2_TOKENIZER.encode(tokens, allowed_special="all")[:numofTokens]
        )
        return newTokens


TOKEN_ESTIMATOR = TokenEstimator()
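
# Illustrative usage (assumed, not in the original commit): the estimator counts GPT-2
# tokens and can truncate a string to a token budget, e.g.
#
#   n = TOKEN_ESTIMATOR.estimate_tokens("some chunk of text")          # token count
#   head = TOKEN_ESTIMATOR.construct_tokens_with_size(long_text, 512)  # first ~512 tokens, decoded back to text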


def generate_random_name(length: int = 12):
    # Characters to use: letters and digits
    characters = string.ascii_letters + string.digits
    # Randomly select `length` characters
    folder_name = ''.join(random.choices(characters, k=length))
    return folder_name


def asdict_with_dynamic(obj: Any) -> dict[str, Any]:
    """Return a dict of the object's declared dataclass fields plus any dynamically added attributes."""
    # Use predefined fields as the basis
    result = {f.name: getattr(obj, f.name) for f in fields(obj)}
    # Add dynamic attributes
    all_attrs = dir(obj)
    predefined_attrs = [f.name for f in fields(obj)]
    for attr in all_attrs:
        # Skip dunder attributes, callables, and predefined fields
        if (
            not attr.startswith("__")
            and not callable(getattr(obj, attr))
            and attr not in predefined_attrs
        ):
            result[attr] = getattr(obj, attr)
    return result
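
# Illustrative example (assumed, not in the original commit): for a dataclass instance
# that has had extra attributes attached at runtime, e.g.
#
#   @dataclass
#   class Chunk:
#       content: str
#   c = Chunk("hi"); c.score = 0.9
#   asdict_with_dynamic(c)  # -> {"content": "hi", "score": 0.9}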


def write_log(message: str):
    """Write log message (INFO level) to data_preparation logger."""
    logging.getLogger("data_preparation").info(msg=message)


def init_current_data_directory(base_path: str) -> str:
    """Initialize the current data directory and return its path."""
    folder_name = generate_random_name(10)
    if base_path == "":
        base_path = os.path.expanduser("~")
    # Create the directory path
    local_data_folder = os.path.join(base_path, "doc-extractor", folder_name)
    os.makedirs(local_data_folder, exist_ok=True)
    return local_data_folder


def write_content(content: str, directory_path: str, file_name: str):
    """Write merged content to a markdown file in the .extracted directory."""
    output_folder = directory_path + "/.extracted/" + file_name
    os.makedirs(output_folder, exist_ok=True)
    with open(f"{output_folder}/_merged.md", "w", encoding="utf-8") as file:
        file.write(content)

    print(f"Merged Saved: {output_folder}/_merged.md")


def write_object(obj: Any, directory_path: str, file_name: str):
    """Write a dictionary to a JSON file in the specified directory."""
    output_folder = directory_path + "/.extracted/" + file_name
    os.makedirs(output_folder, exist_ok=True)
    with open(f"{output_folder}/_merged.json", "w", encoding="utf-8") as file:
        json.dump(obj, file, indent=4, ensure_ascii=False, default=custom_serializer)
    print(f"Dict Saved: {output_folder}/_merged.json")


def write_document(documents: list[Document], file_path: str, directory_path: str, rel_file_path: str):
    """Write the parsed document list to a JSON file in the specified directory."""
    chunks_save = []
    for chunk_idx, chunk_doc in enumerate(documents):
        chunk_doc.filepath = rel_file_path
        chunk_doc.metadata = json.dumps({"chunk_id": str(chunk_idx)})
        chunk_doc.image_mapping = json.dumps(chunk_doc.image_mapping) if chunk_doc.image_mapping else None
        chunks_save.append(asdict_with_dynamic(chunk_doc))

    output_folder = directory_path + "/.chunked"
    os.makedirs(output_folder, exist_ok=True)
    with open(f"{output_folder}/{rel_file_path}.json", "w", encoding="utf-8") as file:
        file.write(json.dumps(chunks_save, indent=4, ensure_ascii=False))
    print(f"Processed {file_path} to {len(documents)} chunks. Document Schema: {documents[0].document_schema}")
    print(f"Saved Result: {output_folder}/{rel_file_path}.json")


# Custom serializer function
def custom_serializer(obj: Any) -> Any:
    """Handle types that cannot be serialized by JSON"""
    if isinstance(obj, datetime):
        return obj.isoformat()  # Convert to ISO 8601 string
    elif isinstance(obj, Decimal):
        return float(obj)  # Decimal to float
    elif hasattr(obj, '__dict__'):
        return obj.__dict__  # Class object to dict
    else:
        raise TypeError(f"Type {type(obj)} cannot be JSON serialized")
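
# Illustrative usage (assumed, not in the original commit): pass custom_serializer as the
# `default` hook so json.dumps can handle datetimes, Decimals, and plain objects, e.g.
#
#   json.dumps({"ts": datetime(2024, 1, 1), "price": Decimal("9.90")},
#              default=custom_serializer)
#   # -> '{"ts": "2024-01-01T00:00:00", "price": 9.9}'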


def keep_latest(data_list: list[dict[str, Any]], id_key: str, timestamp_key: Optional[str] = None) -> list[dict[str, Any]]:
    """
    Keep only the latest record for each entity

    Args:
        data_list: List of dictionaries containing records
        id_key: Key to identify the entity
        timestamp_key: Timestamp key (optional; if not provided, keep the last occurrence)

    Returns:
        List of the latest records for each entity
    """
    latest_dict = {}

    for idx, record in enumerate(data_list):
        entity_id = record[id_key]

        # If no timestamp, keep the last occurrence by position
        if timestamp_key is None or timestamp_key not in record:
            # Record index to handle same id cases
            latest_dict[entity_id] = (idx, record)
            continue

        current_time = record[timestamp_key]

        # If the current record is newer, update
        if entity_id not in latest_dict or current_time > latest_dict[entity_id][1][timestamp_key]:
            latest_dict[entity_id] = (idx, record)

    # Sort by original position (optional)
    return [record for _, record in sorted(latest_dict.values(), key=lambda x: x[0])]
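
# Illustrative example (assumed, not in the original commit):
#
#   rows = [{"id": "a", "ts": 1}, {"id": "b", "ts": 5}, {"id": "a", "ts": 3}]
#   keep_latest(rows, id_key="id", timestamp_key="ts")
#   # -> [{"id": "b", "ts": 5}, {"id": "a", "ts": 3}]
#   # (one record per id; output ordered by the position of each winning record)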


def max_datetime_safe(
    dt1: Union[datetime, None],
    dt2: Union[datetime, None]
) -> Union[datetime, None]:
    """
    Safely get the maximum of two datetimes, handling None values

    Args:
        dt1: First datetime (may be None)
        dt2: Second datetime (may be None)

    Returns:
        The maximum datetime, or None if both are None
    """
    if dt1 is None:
        return dt2
    if dt2 is None:
        return dt1
    return max(dt1, dt2)


def min_datetime_safe(
    dt1: Union[datetime, None],
    dt2: Union[datetime, None]
) -> Union[datetime, None]:
    """
    Safely get the minimum of two datetimes, handling None values

    Rules:
        - Both datetimes are None → return None
        - One datetime is None → return the other
        - Both datetimes are not None → return the smaller one

    Args:
        dt1: First datetime (may be None)
        dt2: Second datetime (may be None)

    Returns:
        The minimum datetime, or None if both are None
    """
    if dt1 is None:
        return dt2
    if dt2 is None:
        return dt1
    return min(dt1, dt2)


def write_json_to_file(data: list[dict], filename: str):
    """Write data to a JSON file."""
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False, default=custom_serializer)
    print(f"JSON file saved: {filename}")


def write_grouped_index_files(to_upload_dicts: list[dict[str, Any]], index_name: str, base_directory: str = ""):
    """
    Write one JSON file per source file to the .index directory, grouping to_upload_dicts by their filepath field

    Args:
        to_upload_dicts: List of dictionaries to upload
        index_name: Name of the target index, used in the output file names
        base_directory: Base directory path
    """
    if not to_upload_dicts:
        print("No data to write.")
        return

    # Group by filepath field
    grouped_data = {}
    for item in to_upload_dicts:
        filepath = item.get("filepath", "unknown")
        if filepath not in grouped_data:
            grouped_data[filepath] = []
        grouped_data[filepath].append(item)

    # Create .index directory
    index_dir = os.path.join(base_directory, ".index")
    os.makedirs(index_dir, exist_ok=True)

    # Create a corresponding JSON file for each filepath
    for filepath, items in grouped_data.items():
        # Convert filepath to a safe filename
        safe_filename = filepath.replace("/", "_").replace("\\", "_").replace(":", "_")
        if safe_filename.endswith(".pdf"):
            safe_filename = safe_filename[:-4]  # Remove .pdf extension

        json_filename = f"{safe_filename}.{index_name}.json"
        json_filepath = os.path.join(index_dir, json_filename)

        # Write JSON file
        with open(json_filepath, "w", encoding="utf-8") as file:
            json.dump(items, file, indent=4, ensure_ascii=False, default=custom_serializer)

        print(f"Grouped index file saved: {json_filepath} (contains {len(items)} items)")

    print(f"Total {len(grouped_data)} files written to .index directory")


def replace_urls_in_content(content: str, replacements: List[FigureFlat]) -> str:
    """
    Insert URLs from the replacement list into the specified positions in the content

    :param content: Original text content
    :param replacements: Replacement list, each element contains:
        - 'url': Image URL
        - 'offset': Offset in the original content
        - 'length': Length of the text to be replaced
    :return: New content with replacements
    """
    if not replacements:
        return content

    # Sort by offset in descending order (process in reverse order)
    sorted_replacements = sorted(replacements, key=lambda x: x.offset, reverse=True)

    # List to store text fragments
    fragments = []
    current_index = len(content)  # Current position (start from the end)

    for item in sorted_replacements:
        # NOTE: the body of this f-string was lost during extraction; a Markdown image
        # reference is assumed here as the replacement text.
        url = f"![]({item.url})"
        offset = item.offset
        length = item.length

        # Check offset validity
        if offset >= current_index:
            continue  # Skip invalid offset

        # Calculate actual end position for replacement
        end_pos = min(offset + length, current_index)

        # 1. Add text between current position and end of replacement
        fragments.append(content[end_pos:current_index])

        # 2. Add URL (replace original content)
        fragments.append(url)

        # Update current position to start of replacement
        current_index = offset

    # Add remaining head content
    fragments.append(content[:current_index])

    # Concatenate fragments in reverse order (since processed backwards)
    return ''.join(fragments[::-1])
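
# Illustrative sketch (assumed, not in the original commit): with content "abcdef" and a
# single FigureFlat-like item having offset=2, length=3, url="http://x/img.png", the span
# "cde" is replaced, yielding "ab![](http://x/img.png)f". Processing offsets in descending
# order keeps earlier offsets valid while later spans are rewritten.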


def resize_image(input_path: str, output_path: Optional[str] = None, max_size: int = 10000) -> str:
    """Proportionally scale a PNG image so that neither width nor height exceeds max_size pixels."""
    with Image.open(input_path) as img:
        # Calculate the scaling ratio
        ratio = min(max_size / max(img.size), 1.0)

        if ratio >= 1:  # No scaling required
            return input_path

        # Calculate new dimensions (maintain aspect ratio)
        new_size = tuple(round(dim * ratio) for dim in img.size)

        # Use a high-quality resampling algorithm
        resized_img = img.resize(new_size, Resampling.LANCZOS)

        # Work out the output path
        if not output_path:
            filename, ext = os.path.splitext(input_path)
            output_path = f"{filename}_resized{ext}"

        # Save the resized image (preserving PNG features)
        resized_img.save(output_path, format="PNG", optimize=True)
        print(f"Image resized: {img.size} → {new_size} | Saved to: {output_path}")
        return output_path


def file_rename(input_path: str) -> str:
    """If the input is a .doc file, copy it to a .docx path and return that path; otherwise return the input path unchanged."""
    filename, ext = os.path.splitext(input_path)
    if ext.lower() == ".doc":
        new_path = f"{filename}.docx"
        shutil.copy2(input_path, new_path)
        print("file renamed to ", new_path)
        return new_path
    return input_path