From c3065028a3ba1de07642a6606bf66c7a4142e09e Mon Sep 17 00:00:00 2001 From: "friedemann.blume" Date: Fri, 19 Jul 2024 13:41:51 +0200 Subject: [PATCH] v1.1 --- mbox_to_markdown.py | 92 +++++++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/mbox_to_markdown.py b/mbox_to_markdown.py index 7f9b5d2..da3d7b0 100644 --- a/mbox_to_markdown.py +++ b/mbox_to_markdown.py @@ -37,56 +37,66 @@ def format_date(date_str): return 'NoDate' def save_email_as_markdown(email, index, output_subdir): - subject = email.get('subject', 'No Subject') - date = email.get('date', 'No Date') - sender = email.get('from', 'Unknown Sender') - recipients = email.get('to', 'Unknown Recipient') + logger.info(f"Starting to process email {index + 1}") + try: + subject = email.get('subject', 'No Subject') + date = email.get('date', 'No Date') + sender = email.get('from', 'Unknown Sender') + recipients = email.get('to', 'Unknown Recipient') - # Sanitize and format the filename - sanitized_subject = sanitize_filename(subject) - sanitized_sender = sanitize_filename(sender.split('<')[0].strip()) - sanitized_recipients = sanitize_filename(recipients.split(',')[0].strip()) - formatted_date = format_date(date) - filename = f"{sanitized_subject} - {sanitized_sender} - {sanitized_recipients} - {formatted_date}.md" - filename = os.path.join(output_subdir, filename) + # Sanitize and format the filename + sanitized_subject = sanitize_filename(subject) + sanitized_sender = sanitize_filename(sender.split('<')[0].strip()) + sanitized_recipients = sanitize_filename(recipients.split(',')[0].strip()) + formatted_date = format_date(date) + filename = f"{sanitized_subject} - {sanitized_sender} - {sanitized_recipients} - {formatted_date}.md" + filename = os.path.join(output_subdir, filename) - # Handle potential None payload - payload = email.get_payload(decode=True) - if payload is None: - body_markdown = "No content available" - else: - try: - body = payload.decode(errors='ignore') - body_markdown = markdownify(body) - except (UnicodeDecodeError, AttributeError) as e: - body_markdown = f"Error decoding content: {e}" + # Handle potential None payload + payload = email.get_payload(decode=True) + if payload is None: + body_markdown = "No content available" + else: + try: + body = payload.decode(errors='ignore') + body_markdown = markdownify(body) + except (UnicodeDecodeError, AttributeError) as e: + body_markdown = f"Error decoding content: {e}" - # Create a Markdown file for each email - with open(filename, 'w', encoding='utf-8') as file: - file.write(f'# {subject}\n') - file.write(f'*Date: {date}*\n') - file.write(f'*From: {sender}*\n') - file.write(f'*To: {recipients}*\n\n') - file.write(body_markdown) + # Create a Markdown file for each email + with open(filename, 'w', encoding='utf-8') as file: + file.write(f'# {subject}\n') + file.write(f'*Date: {date}*\n') + file.write(f'*From: {sender}*\n') + file.write(f'*To: {recipients}*\n\n') + file.write(body_markdown) - logger.info(f"Saved email {index + 1} as Markdown: {filename}") + logger.info(f"Saved email {index + 1} as Markdown: {filename}") + except Exception as e: + logger.error(f"Error processing email {index + 1}: {e}") def convert_mbox_to_markdown(mbox_file): - # Create a subdirectory in the output directory with the name of the .mbox file - base_name = os.path.basename(mbox_file) - subdir_name = os.path.splitext(base_name)[0] # Remove the .mbox extension - output_subdir = os.path.join(output_dir, subdir_name) - os.makedirs(output_subdir, exist_ok=True) + try: + # Create a subdirectory in the output directory with the name of the .mbox file + base_name = os.path.basename(mbox_file) + subdir_name = os.path.splitext(base_name)[0] # Remove the .mbox extension + output_subdir = os.path.join(output_dir, subdir_name) + os.makedirs(output_subdir, exist_ok=True) - logger.info(f"Processing .mbox file: {mbox_file}") - mbox = mailbox.mbox(mbox_file) + logger.info(f"Processing .mbox file: {mbox_file}") + mbox = mailbox.mbox(mbox_file) - # Show progress bar - total_emails = len(mbox) - logger.info(f"Total emails to process: {total_emails}") + # Show progress bar + total_emails = len(mbox) + logger.info(f"Total emails to process: {total_emails}") - for i, email in tqdm(enumerate(mbox), total=total_emails, desc='Converting emails'): - save_email_as_markdown(email, i, output_subdir) + for i, email in tqdm(enumerate(mbox), total=total_emails, desc='Converting emails'): + logger.info(f"Processing email {i + 1}/{total_emails}") + save_email_as_markdown(email, i, output_subdir) + + logger.info(f"Completed processing {mbox_file}") + except Exception as e: + logger.error(f"Error processing mbox file {mbox_file}: {e}") class MboxFileHandler(FileSystemEventHandler): def on_created(self, event):