diff --git a/.DS_Store b/.DS_Store index 72b17cc..efee48e 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 968e59d..c5f3b2c 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ mbox-files/* +output +output/* +trash +trash/* diff --git a/Readme.md b/Readme.md index ac800ca..0967ea3 100644 --- a/Readme.md +++ b/Readme.md @@ -1,7 +1,13 @@ ## Mbox to Markdown converter +This simple Docker image converts a .mbox file (for example, from a Google Takeout export of a Gmail account) into Markdown files. +I use it for archiving: I wanted to keep my old Gmail emails. +Nothing fancy, but useful for those who don't want to use online converters or deal with Python directly. + +## Usage: + +``` docker build -t mbox-to-markdown . -docker run --rm -v ./mbox-files:/mnt/input -v /path/to/output/directory:/mnt/output mbox-to-markdown python mbox_to_markdown.py /mnt/input/yourfile.mbox /mnt/output/ - - +docker run --rm -v ./mbox-files:/mnt/input -v ./output/:/mnt/output mbox-to-markdown +``` diff --git a/mbox_to_markdown.py b/mbox_to_markdown.py index 3db2446..7f9b5d2 100644 --- a/mbox_to_markdown.py +++ b/mbox_to_markdown.py @@ -1,31 +1,114 @@ import mailbox import os +import re +import logging +from datetime import datetime +from email.utils import parsedate_tz, mktime_tz from markdownify import markdownify +from watchdog.observers import Observer +from watchdog.events import FileSystemEventHandler +from tqdm import tqdm # Configuration -mbox_file = 'path/to/your/file.mbox' -output_dir = 'path/to/output/directory' +input_dir = '/mnt/input' +output_dir = '/mnt/output' + +# Setup logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) # Ensure output directory exists os.makedirs(output_dir, exist_ok=True) -def save_email_as_markdown(email, index): +def sanitize_filename(filename): + """Sanitize the filename to remove
invalid characters."""
+    return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '', filename)
+
+def format_date(date_str):
+    """Format the date string to be suitable for filenames.
+
+    Returns the date as ``YYYY-MM-DD_HH-MM-SS`` in the local time zone of the
+    running process, or ``'NoDate'`` when the header cannot be parsed.
+    """
+    try:
+        parsed_date = parsedate_tz(date_str)
+        if parsed_date is not None:
+            timestamp = mktime_tz(parsed_date)
+            formatted_date = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d_%H-%M-%S')
+            return formatted_date
+    except Exception as e:
+        logger.error(f"Error formatting date: {e}")
+    return 'NoDate'
+
+def _header_to_text(value):
+    """Return a header value as single-line text, decoding RFC 2047 encoded words."""
+    # Local imports: only this helper needs these names
+    from email.errors import HeaderParseError
+    from email.header import decode_header, make_header
+
+    try:
+        text = str(make_header(decode_header(value)))
+    except (HeaderParseError, LookupError, UnicodeError, TypeError, ValueError):
+        # Malformed or unknown encoding: fall back to the raw header text
+        text = str(value)
+    # Folded headers contain line breaks; collapse every whitespace run to one space
+    return ' '.join(text.split())
+
+def _extract_body(message):
+    """Return the decoded text of the best body part of *message*, or None.
+
+    The first text/html part wins; otherwise the first other text part is used.
+    Parts marked as attachments are skipped.
+    """
+    fallback_text = None
+    for part in message.walk():
+        if part.is_multipart() or part.get_content_maintype() != 'text':
+            continue
+        if part.get_content_disposition() == 'attachment':
+            continue
+        payload = part.get_payload(decode=True)
+        if payload is None:
+            continue
+        try:
+            text = payload.decode(part.get_content_charset() or 'utf-8', errors='ignore')
+        except LookupError:
+            # The part declares a charset Python does not know
+            text = payload.decode('utf-8', errors='ignore')
+        if part.get_content_subtype() == 'html':
+            return text
+        if fallback_text is None:
+            fallback_text = text
+    return fallback_text
+
+def save_email_as_markdown(email, index, output_subdir):
+    """Write one message as a Markdown file inside *output_subdir*.
+
+    email: the message object yielded by the mailbox.
+    index: 0-based position of the message in the mailbox; its 1-based form is
+        part of the file name so that two messages never share a file.
+    output_subdir: existing directory that receives the ``.md`` file.
+    """
     subject = email.get('subject', 'No Subject')
     date = email.get('date', 'No Date')
-    body = email.get_payload(decode=True).decode(errors='ignore')
-    body_markdown = markdownify(body)
+    sender = email.get('from', 'Unknown Sender')
+    recipients = email.get('to', 'Unknown Recipient')
+
+    # Headers may be RFC 2047 encoded or Header objects; work with plain text from here on
+    subject = _header_to_text(subject)
+    date = _header_to_text(date)
+    sender = _header_to_text(sender)
+    recipients = _header_to_text(recipients)
+
+    # Sanitize and format the filename
+    sanitized_subject = sanitize_filename(subject)
+    sanitized_sender = sanitize_filename(sender.split('<')[0].strip())
+    sanitized_recipients = sanitize_filename(recipients.split(',')[0].strip())
+    formatted_date = format_date(date)
+    # The index keeps names unique when subject, parties and second all coincide
+    suffix = f" - {formatted_date} - {index + 1}.md"
+    stem = f"{sanitized_subject} - {sanitized_sender} - {sanitized_recipients}"
+    # File names are commonly limited to 255 bytes; trim only the descriptive part
+    stem = stem.encode('utf-8', errors='ignore')[:240 - len(suffix)].decode('utf-8', errors='ignore')
+    filename = os.path.join(output_subdir, stem + suffix)
+
+    # get_payload(decode=True) is None for multipart mail, so look inside the parts
+    body = _extract_body(email)
+    if body is None:
+        body_markdown = "No content available"
+    else:
+        try:
+            body_markdown = markdownify(body)
+        except (UnicodeDecodeError, AttributeError) as e:
+            body_markdown = f"Error decoding content: {e}"
 
     # Create a Markdown file for each email
-    filename = os.path.join(output_dir, f'email_{index}.md')
     with open(filename, 'w', encoding='utf-8') as file:
         file.write(f'# {subject}\n')
-        file.write(f'*Date: {date}*\n\n')
+        file.write(f'*Date: {date}*\n')
+        file.write(f'*From: {sender}*\n')
+        file.write(f'*To: {recipients}*\n\n')
         file.write(body_markdown)
 
+    logger.info(f"Saved email {index + 1} as Markdown: {filename}")
+
 def convert_mbox_to_markdown(mbox_file):
+    """Convert every message of *mbox_file* into a Markdown file.
+
+    The files go to a subdirectory of ``output_dir`` named after the mbox file
+    without its extension. A message that fails to convert is logged and skipped.
+    """
+    # Create a subdirectory in the output directory with the name of the .mbox file
+    base_name = os.path.basename(mbox_file)
+    subdir_name = os.path.splitext(base_name)[0]  # Remove the .mbox extension
+    output_subdir = os.path.join(output_dir, subdir_name)
+    os.makedirs(output_subdir, exist_ok=True)
+
+    logger.info(f"Processing .mbox file: {mbox_file}")
     mbox = mailbox.mbox(mbox_file)
-    for i, email in enumerate(mbox):
-        save_email_as_markdown(email, i)
 
-convert_mbox_to_markdown(mbox_file)
+    try:
+        # Show progress bar
+        total_emails = len(mbox)
+        logger.info(f"Total emails to process: {total_emails}")
+
+        for i, email in tqdm(enumerate(mbox), total=total_emails, desc='Converting emails'):
+            try:
+                save_email_as_markdown(email, i, output_subdir)
+            except Exception:
+                # One unreadable message must not abort the rest of the mailbox
+                logger.exception(f"Failed to convert email {i + 1} of {mbox_file}")
+    finally:
+        # Release the file handle held by the mailbox
+        mbox.close()
+
+def _wait_until_stable(path, interval=1.0, stable_checks=2):
+    """Block until the size of *path* has stopped changing.
+
+    Heuristic for "the copy into the watched directory has finished": the size
+    must be identical on *stable_checks* consecutive polls, *interval* seconds apart.
+    """
+    import time
+
+    last_size = -1
+    unchanged = 0
+    while unchanged < stable_checks:
+        try:
+            size = os.path.getsize(path)
+        except OSError:
+            # The file is gone or unreadable; let the caller's conversion report it
+            return
+        unchanged = unchanged + 1 if size == last_size else 0
+        last_size = size
+        time.sleep(interval)
+
+class MboxFileHandler(FileSystemEventHandler):
+    """Convert each .mbox file that is created in the watched directory."""
+
+    def on_created(self, event):
+        """Convert the created file when its name ends in ``.mbox``."""
+        if event.is_directory:
+            return
+        if event.src_path.endswith('.mbox'):
+            logger.info(f"New .mbox file detected: {event.src_path}")
+            # The event fires on creation, possibly before the file is fully written
+            _wait_until_stable(event.src_path)
+            try:
+                convert_mbox_to_markdown(event.src_path)
+            except Exception:
+                # Keep the observer thread alive for the next file
+                logger.exception(f"Failed to convert {event.src_path}")
+
+def start_watching():
+    """Convert the .mbox files already in ``input_dir``, then watch it for new ones.
+
+    Runs until interrupted with Ctrl+C (KeyboardInterrupt).
+    """
+    import time
+
+    event_handler = MboxFileHandler()
+    observer = Observer()
+    observer.schedule(event_handler, path=input_dir, recursive=False)
+    observer.start()
+    logger.info(f"Watching for new .mbox files in {input_dir}...")
+    try:
+        # Files that were mounted before start-up never raise a "created" event
+        for name in sorted(os.listdir(input_dir)):
+            path = os.path.join(input_dir, name)
+            if name.endswith('.mbox') and os.path.isfile(path):
+                try:
+                    convert_mbox_to_markdown(path)
+                except Exception:
+                    logger.exception(f"Failed to convert {path}")
+        while True:
+            time.sleep(1)  # Idle without spinning a CPU core
+    except KeyboardInterrupt:
+        pass
+    finally:
+        observer.stop()
+        observer.join()
+
+if __name__ == "__main__":
+    start_watching()
diff --git a/requirements.txt b/requirements.txt
index 3fafbc9..666fd82 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 mailbox
 markdownify
+watchdog
+tqdm