def _decode_text_part(part):
    """Return the decoded text of a single message part, or None if it has no payload.

    The part's declared charset is honoured; UTF-8 is used when none is
    declared or when the declared charset is unknown to Python.
    """
    payload = part.get_payload(decode=True)
    if payload is None:
        # get_payload(decode=True) yields None for multipart containers.
        return None
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='ignore')
    except LookupError:
        # Bogus/unknown charset label in the header: fall back to UTF-8.
        return payload.decode('utf-8', errors='ignore')


def extract_email_content(email):
    """Extract the email content, prioritizing text/plain over text/html.

    For multipart messages every non-attachment part is inspected: the first
    text/plain part with a payload wins; if there is none, the first
    text/html part is converted to Markdown with ``markdownify``.  For
    single-part messages the body is returned directly (text/plain) or
    converted (text/html).

    Args:
        email: An ``email.message.Message``-like object.

    Returns:
        The body as a string, or ``"No content available"`` when no usable
        text/plain or text/html payload is found.
    """
    if email.is_multipart():
        html_text = None
        for part in email.walk():
            # Header values are case-insensitive ("Attachment" is legal).
            disposition = str(part.get('Content-Disposition') or '').lower()
            if 'attachment' in disposition:
                continue
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                text = _decode_text_part(part)
                if text is not None:
                    return text
            elif content_type == 'text/html' and html_text is None:
                # Remember the first HTML body, but keep scanning in case a
                # text/plain alternative appears later in the message.
                html_text = _decode_text_part(part)
        if html_text is not None:
            return markdownify(html_text)
        return "No content available"

    content_type = email.get_content_type()
    if content_type in ('text/plain', 'text/html'):
        text = _decode_text_part(email)
        if text is not None:
            return text if content_type == 'text/plain' else markdownify(text)
    return "No content available"