Remove Duplicate Messages
Summary: Removes duplicate e-mail messages from a mailbox.
Requires: EagleFiler 1.6.6
Install Location: ~/Library/Scripts/Applications/EagleFiler/
Last Modified: 2022-08-11
Description
This script scans the selected mailboxes in the records list or the source list. (Prior to EagleFiler 1.7, the mailboxes must be selected in the records list; with 1.7 or later, the script will use whichever list has focus.) If a mailbox contains duplicate copies of the same e-mail message, the script creates a new mailbox with the duplicate e-mails removed. (If your duplicates are spread across multiple mailboxes or are stored in .eml files, you should first merge them.) It then moves the original mailbox to the trash. This script has the following limitations:
- Any tags or notes attached to messages within the mailbox will be lost.
- If the mailbox uses DOS or Mac linebreaks, it will be converted to use Unix linebreaks.
- It only removes duplicate messages that contain a valid Message-ID header. (This header is required by Internet standards, but it is missing from some rare e-mails, such as spam.)
- The mailbox’s EagleFiler metadata is preserved, except that any rich text in the notes is converted to plain text.
- The new mailbox file is a different EagleFiler record; any x-eaglefiler links to the old mailbox will continue pointing to the old record (now in the trash).
Mailboxes of a few MB will be processed very quickly. For large mailboxes, the script may take a long time to run. It does not report its progress while it’s running. If you are wondering whether the script is still doing anything, you can watch the “formail” process in Activity Monitor. To help determine what performance you should expect, here are some processing times from a 2012 Retina MacBook Pro:
- 1,394 messages (5.3 MB of mail) took 4 seconds, or about 348 messages (1.3 MB) per second.
- 22K messages (38 MB of mail) took 1.5 minutes, or about 244 messages (0.42 MB) per second.
- 83K messages (192 MB of mail) took 18 minutes, or about 77 messages (0.18 MB) per second.
- 200K messages (3.4 GB of mail) took 79 minutes, or about 42 messages (0.7 MB) per second.
Installation Instructions · Download in Compiled Format · Download in Text Format
Script
-- Entry point: de-duplicates every mailbox among the records of the frontmost
-- EagleFiler browser window, and alerts about any record that is not a mailbox.
tell application "EagleFiler"
	-- "current records" requires EagleFiler 1.7; if asking for it fails for any
	-- reason, fall back to the records-list selection.
	try
		set _records to current records of browser window 1
	on error
		set _records to selected records of browser window 1
	end try
	
	-- Count the mailboxes seen so we can tell the user when there were none.
	set _mailboxCount to 0
	repeat with _record in _records
		if not (my isMailboxRecord(_record)) then
			my showAlert("Skipping File", "Skipping “" & _record's filename & "” because it is not a mailbox.")
		else
			set _mailboxCount to _mailboxCount + 1
			my removeDuplicatesFromMailboxRecord(_record)
		end if
	end repeat
	
	if _mailboxCount is 0 then
		my showAlert("You did not select any mailboxes.", "Please click on Records or a folder in the source list (at the left) and then select one or more mailboxes in the records list (at the top-right).")
	end if
end tell
-- Returns true when _record's universal type identifier is
-- "com.c-command.mail.mbox", and false otherwise.
on isMailboxRecord(_record)
	tell application "EagleFiler"
		set _typeIdentifier to _record's universal type identifier
		return _typeIdentifier is "com.c-command.mail.mbox"
	end tell
end isMailboxRecord
-- Replaces one mailbox record with a de-duplicated copy.
--
-- _record: an EagleFiler record for a mailbox file.
--
-- Builds a de-duplicated mailbox via removeDuplicatesFromPath. If no duplicates
-- were found, the record is left untouched and an alert says so. Otherwise the
-- new file is imported into the record's library, the old record's metadata is
-- copied onto it, the old record is moved to the trash, and the new record takes
-- over the old filename. An alert then reports how many duplicates were removed.
on removeDuplicatesFromMailboxRecord(_record)
	tell application "EagleFiler"
		set _file to _record's file
		set _path to _file's POSIX path
		set _filename to _record's filename
		
		set {_newPath, _count} to my removeDuplicatesFromPath(_path)
		if _count is 0 then
			my showAlert("No Duplicates Found", "There were no duplicate messages in “" & _filename & "”.")
			return
		end if
		
		tell _record's library document
			-- Import and copy metadata BEFORE trashing the original, so a failure
			-- here leaves the original record in place.
			set {_newRecord} to import files {_newPath}
			my copyMetadata(_record, _newRecord)
			set container of _record to trash
			-- Wasn't possible to set it correctly earlier (presumably because the
			-- original record still held this name in the same container).
			set _newRecord's filename to _filename
		end tell
		
		-- Use the singular noun for a count of exactly one.
		if _count is 1 then
			set _noun to "message"
		else
			set _noun to "messages"
		end if
		my showAlert("Duplicates Removed", "Removed " & _count & " duplicate " & _noun & " from “" & _filename & "”.")
	end tell
end removeDuplicatesFromMailboxRecord
-- Writes a de-duplicated copy of the mailbox at _sourcePath into a fresh
-- temporary folder.
--
-- _sourcePath: POSIX path of the mailbox file to read.
-- Returns: {POSIX path of the new mailbox, number of duplicates formail reported}.
--
-- The pipeline's stderr from formail is captured in a log file inside the same
-- temporary folder; that log is used both to count duplicates and to surface
-- unexpected messages to the user.
on removeDuplicatesFromPath(_sourcePath)
	set _tempFolder to my makeTemporaryFolder()
	set _cachePath to _tempFolder & "/" & "idcache"
	set _destPath to _tempFolder & "/" & "NewMailbox.mbox"
	set _logPath to _tempFolder & "/" & "Log.log"
	
	-- cat Old | perl -p -e 's/\r\n/\n/g' | perl -p -e 's/\r/\n/g' | formail -b -e -q- -Y -D 104857600 idcache -s > New
	set _script to "cat " & _sourcePath's quoted form
	-- Normalize DOS (CRLF) and then classic Mac (CR) linebreaks to Unix (LF).
	set _script to _script & " | perl -p -e 's/\\r\\n/\\n/g'"
	set _script to _script & " | perl -p -e 's/\\r/\\n/g'"
	-- The formail path lives inside the EagleFiler application bundle, whose
	-- location may contain spaces or shell metacharacters, so it must be quoted
	-- like every other path in this command.
	set _script to _script & " | " & quoted form of (my formailPath()) & " -b -e -q- -Y"
	set _script to _script & " -D 104857600 " & _cachePath's quoted form -- 100 MB
	set _script to _script & " -s > " & _destPath's quoted form
	set _script to _script & " 2> " & _logPath's quoted form
	
	-- Large mailboxes can take a long time; allow up to 24 hours.
	with timeout of 24 * 60 * 60 seconds
		do shell script _script
	end timeout
	
	set _count to my countDuplicatesFoundFromLogPath(_logPath)
	my reportErrorsFromLogPath(_logPath)
	return {_destPath, _count}
end removeDuplicatesFromPath
-- Counts the lines in the log at _logPath that begin with
-- "formail: Duplicate key found:" and returns that count as a number.
-- When the grep command fails with error number 1, returns 0.
on countDuplicatesFoundFromLogPath(_logPath)
	set _grepCommand to "grep -c \"^formail: Duplicate key found:\" " & quoted form of _logPath
	try
		set _matchCount to do shell script _grepCommand
	on error number 1
		return 0
	end try
	return _matchCount as number
end countDuplicatesFoundFromLogPath
-- Looks for unexpected entries in the formail log at _logPath and, if there are
-- any, opens the log in Console and asks the user whether to continue.
--
-- _logPath: POSIX path of the log file written by the formail pipeline.
--
-- The alert's first button ("Cancel") is its cancel button; "Ignore the Errors"
-- lets the caller carry on.
on reportErrorsFromLogPath(_logPath)
	-- Skip lines starting with "<" because they are probably a wrapped message ID
	set _script to "grep -vEc \"^(\\s*<|formail: Duplicate key found:)\" " & _logPath's quoted form
	try
		set _stdout to do shell script _script
		set _errorCount to _stdout as number
	on error
		-- Any failure of the grep command (including its non-zero exit status
		-- when nothing is selected) is treated as "no unexpected entries".
		set _errorCount to 0
	end try
	
	if _errorCount > 0 then
		-- Quote the path, as for the grep call above, so a temporary directory
		-- containing spaces or shell metacharacters cannot break the command.
		do shell script "open -a Console " & _logPath's quoted form
		set _title to "Possible Errors Reported"
		set _message to (_errorCount as string) & " unexpected log entries were found when processing the mailbox. Please review them in Console."
		with timeout of 24 * 60 * 60 seconds
			display alert _title message _message buttons {"Cancel", "Ignore the Errors"} cancel button 1
		end timeout
	end if
end reportErrorsFromLogPath
-- Creates a new temporary directory with mktemp (template prefix
-- 'EFRemoveDuplicateMessages') and returns the command's output, i.e. its path.
on makeTemporaryFolder()
	set _folderPath to do shell script "mktemp -d -t 'EFRemoveDuplicateMessages'"
	return _folderPath
end makeTemporaryFolder
-- Copies EagleFiler metadata from the record _source onto the record _dest:
-- source URL, container, note text, assigned tags, title, from name, label
-- index, creation date and modification date, in that order.
on copyMetadata(_source, _dest)
	tell application "EagleFiler"
		-- Each value is fetched with an explicit "get" before being assigned,
		-- so the destination is handed a value rather than a reference.
		set source URL of _dest to (get _source's source URL)
		set container of _dest to _source's container
		set note text of _dest to (get _source's note text)
		set assigned tags of _dest to (get _source's assigned tags)
		set title of _dest to (get _source's title)
		set from name of _dest to (get _source's from name)
		set label index of _dest to (get _source's label index)
		-- Dates are written last, after all other property changes.
		set creation date of _dest to (get _source's creation date)
		set modification date of _dest to (get _source's modification date)
	end tell
end copyMetadata
-- Displays an alert with the given title and message and two buttons,
-- "Cancel" and "OK"; "Cancel" (button 1) is the cancel button.
-- Waits up to 24 hours for the user to respond.
on showAlert(_title, _message)
	set _oneDayInSeconds to 24 * 60 * 60
	with timeout of _oneDayInSeconds seconds
		display alert _title message _message buttons {"Cancel", "OK"} cancel button 1
	end timeout
end showAlert
-- Returns the POSIX path of the formail executable inside the EagleFiler
-- application bundle (under WashFramework.framework).
on formailPath()
	-- Prior to Mac OS X 10.11, it's also available at /usr/bin/formail.
	set _bundlePath to POSIX path of (path to application "EagleFiler")
	return _bundlePath & "Contents/Frameworks/WashFramework.framework/Versions/A/formail"
end formailPath