Commit d2ef843955101547a6d09ebaad331cac7b92f978
1 parent
9e61c6b8
Exists in
master
and in
39 other branches
Removing a few whitespaces
Showing
1 changed file
with
52 additions
and
52 deletions
Show diff stats
src/super_archives/management/commands/import_emails.py
| ... | ... | @@ -21,35 +21,35 @@ class Command(BaseCommand, object): |
| 21 | 21 | """Get emails from mailman archives and import them in the django db. """ |
| 22 | 22 | |
| 23 | 23 | help = __doc__ |
| 24 | - | |
| 24 | + | |
| 25 | 25 | default_archives_path = '/var/lib/mailman/archives/private' |
| 26 | 26 | RE_SUBJECT_CLEAN = re.compile('((re|res|fw|fwd|en|enc):)|\[.*?\]', |
| 27 | 27 | re.IGNORECASE) |
| 28 | 28 | THREAD_CACHE = {} |
| 29 | 29 | EMAIL_ADDR_CACHE = {} |
| 30 | - | |
| 30 | + | |
| 31 | 31 | # A new command line option to get the dump file to parse. |
| 32 | 32 | option_list = BaseCommand.option_list + ( |
| 33 | 33 | make_option('--archives_path', |
| 34 | 34 | dest='archives_path', |
| 35 | - help='Path of email archives to be imported. (default: %s)' % | |
| 35 | + help='Path of email archives to be imported. (default: %s)' % | |
| 36 | 36 | default_archives_path, |
| 37 | 37 | default=default_archives_path), |
| 38 | - | |
| 38 | + | |
| 39 | 39 | make_option('--exclude-list', |
| 40 | 40 | dest='exclude_lists', |
| 41 | - help=("Mailing list that won't be imported. It can be used many" | |
| 41 | + help=("Mailing list that won't be imported. It can be used many" | |
| 42 | 42 | "times for more than one list."), |
| 43 | 43 | action='append', |
| 44 | 44 | default=None), |
| 45 | - | |
| 45 | + | |
| 46 | 46 | make_option('--all', |
| 47 | 47 | dest='all', |
| 48 | 48 | help='Import all messages (default: False)', |
| 49 | 49 | action="store_true", |
| 50 | 50 | default=False), |
| 51 | 51 | ) |
| 52 | - | |
| 52 | + | |
| 53 | 53 | def __init__(self, *args, **kwargs): |
| 54 | 54 | super(Command, self).__init__(*args, **kwargs) |
| 55 | 55 | |
| ... | ... | @@ -68,18 +68,18 @@ class Command(BaseCommand, object): |
| 68 | 68 | |
| 69 | 69 | Yield: An instance of `mailbox.mboxMessage` for each email in the |
| 70 | 70 | file. |
| 71 | - | |
| 71 | + | |
| 72 | 72 | """ |
| 73 | 73 | self.log("Parsing email dump: %s." % email_filename) |
| 74 | 74 | mbox = mailbox.mbox(email_filename, factory=CustomMessage) |
| 75 | - | |
| 75 | + | |
| 76 | 76 | # Get each email from mbox file |
| 77 | 77 | # |
| 78 | 78 | # The following implementation was used because the object |
| 79 | - # mbox does not support slicing. Converting the object to a | |
| 80 | - # tuple (as represented in the code down here) was a valid | |
| 79 | + # mbox does not support slicing. Converting the object to a | |
| 80 | + # tuple (as represented in the code down here) was a valid | |
| 81 | 81 | # option but its performance was too poor. |
| 82 | - # | |
| 82 | + # | |
| 83 | 83 | #for message in tuple(mbox)[index:]: |
| 84 | 84 | # yield message |
| 85 | 85 | # |
| ... | ... | @@ -90,8 +90,8 @@ class Command(BaseCommand, object): |
| 90 | 90 | |
| 91 | 91 | def get_emails(self, mailinglist_dir, all, exclude_lists): |
| 92 | 92 | """Generator function that gets the emails from each mailing |
| 93 | - list dump directory. If `all` is set to True all the emails in the | |
| 94 | - mbox will be imported if not it will just resume from the last | |
| 93 | + list dump directory. If `all` is set to True all the emails in the | |
| 94 | + mbox will be imported if not it will just resume from the last | |
| 95 | 95 | message previously imported. The lists set in `exclude_lists` |
| 96 | 96 | won't be imported. |
| 97 | 97 | |
| ... | ... | @@ -99,20 +99,20 @@ class Command(BaseCommand, object): |
| 99 | 99 | |
| 100 | 100 | """ |
| 101 | 101 | self.log("Getting emails dumps from: %s" % mailinglist_dir) |
| 102 | - | |
| 102 | + | |
| 103 | 103 | # Get the list of directories ending with .mbox |
| 104 | - mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) | |
| 104 | + mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) | |
| 105 | 105 | if mbox.endswith('.mbox')) |
| 106 | - | |
| 106 | + | |
| 107 | 107 | # Get messages from each mbox |
| 108 | 108 | for mbox in mailing_lists_mboxes: |
| 109 | 109 | mbox_path = os.path.join(mailinglist_dir, mbox, mbox) |
| 110 | 110 | mailinglist_name = mbox.split('.')[0] |
| 111 | - | |
| 111 | + | |
| 112 | 112 | # Check if the mailinglist is set not to be imported |
| 113 | 113 | if exclude_lists and mailinglist_name in exclude_lists: |
| 114 | 114 | continue |
| 115 | - | |
| 115 | + | |
| 116 | 116 | # Find the index of the last imported message |
| 117 | 117 | if all: |
| 118 | 118 | n_msgs = 0 |
| ... | ... | @@ -123,13 +123,13 @@ class Command(BaseCommand, object): |
| 123 | 123 | n_msgs = mailinglist.last_imported_index |
| 124 | 124 | except MailingList.DoesNotExist: |
| 125 | 125 | n_msgs = 0 |
| 126 | - | |
| 126 | + | |
| 127 | 127 | for index, msg in self.parse_emails(mbox_path, n_msgs): |
| 128 | 128 | yield mailinglist_name, msg, index |
| 129 | 129 | |
| 130 | 130 | def get_thread(self, email, mailinglist): |
| 131 | 131 | """Group messages by thread looking for similar subjects""" |
| 132 | - | |
| 132 | + | |
| 133 | 133 | subject_slug = slugify(email.subject_clean) |
| 134 | 134 | thread = self.THREAD_CACHE.get(subject_slug, {}).get(mailinglist.id) |
| 135 | 135 | if thread is None: |
| ... | ... | @@ -137,27 +137,27 @@ class Command(BaseCommand, object): |
| 137 | 137 | mailinglist=mailinglist, |
| 138 | 138 | subject_token=subject_slug |
| 139 | 139 | )[0] |
| 140 | - | |
| 140 | + | |
| 141 | 141 | if self.THREAD_CACHE.get(subject_slug) is None: |
| 142 | 142 | self.THREAD_CACHE[subject_slug] = dict() |
| 143 | 143 | self.THREAD_CACHE[subject_slug][mailinglist.id] = thread |
| 144 | 144 | |
| 145 | 145 | thread.latest_message = email |
| 146 | - thread.save() | |
| 146 | + thread.save() | |
| 147 | 147 | return thread |
| 148 | - | |
| 148 | + | |
| 149 | 149 | def save_email(self, list_name, email_msg, index): |
| 150 | 150 | """Save email message into the database.""" |
| 151 | - | |
| 151 | + | |
| 152 | 152 | # Update last imported message into the DB |
| 153 | 153 | mailinglist, created = MailingList.objects.get_or_create(name=list_name) |
| 154 | 154 | mailinglist.last_imported_index = index |
| 155 | - | |
| 156 | - if created: | |
| 155 | + | |
| 156 | + if created: | |
| 157 | 157 | # if the mailinglist is newly created it's sure that the message |
| 158 | 158 | # is not in the DB yet. |
| 159 | 159 | self.create_email(mailinglist, email_msg) |
| 160 | - | |
| 160 | + | |
| 161 | 161 | else: |
| 162 | 162 | # If the message is already at the database don't do anything |
| 163 | 163 | try: |
| ... | ... | @@ -165,11 +165,11 @@ class Command(BaseCommand, object): |
| 165 | 165 | message_id=email_msg.get('Message-ID'), |
| 166 | 166 | thread__mailinglist=mailinglist |
| 167 | 167 | ) |
| 168 | - | |
| 168 | + | |
| 169 | 169 | except Message.DoesNotExist: |
| 170 | 170 | self.create_email(mailinglist, email_msg) |
| 171 | - | |
| 172 | - mailinglist.save() | |
| 171 | + | |
| 172 | + mailinglist.save() | |
| 173 | 173 | |
| 174 | 174 | def create_email(self, mailinglist, email_msg): |
| 175 | 175 | |
| ... | ... | @@ -198,59 +198,59 @@ class Command(BaseCommand, object): |
| 198 | 198 | email.thread = self.get_thread(email, mailinglist) |
| 199 | 199 | email.save() |
| 200 | 200 | |
| 201 | - @transaction.commit_manually | |
| 201 | + @transaction.commit_manually | |
| 202 | 202 | def import_emails(self, archives_path, all, exclude_lists=None): |
| 203 | - """Get emails from the filesystem from the `archives_path` | |
| 204 | - and store them into the database. If `all` is set to True all | |
| 205 | - the filesystem storage will be imported otherwise the | |
| 206 | - importation will resume from the last message previously | |
| 203 | + """Get emails from the filesystem from the `archives_path` | |
| 204 | + and store them into the database. If `all` is set to True all | |
| 205 | + the filesystem storage will be imported otherwise the | |
| 206 | + importation will resume from the last message previously | |
| 207 | 207 | imported. The lists set in `exclude_lists` won't be imported. |
| 208 | - | |
| 208 | + | |
| 209 | 209 | """ |
| 210 | - | |
| 210 | + | |
| 211 | 211 | count = 0 |
| 212 | 212 | email_generator = self.get_emails(archives_path, all, exclude_lists) |
| 213 | 213 | for mailinglist_name, msg, index in email_generator: |
| 214 | 214 | try: |
| 215 | 215 | self.save_email(mailinglist_name, msg, index) |
| 216 | 216 | except: |
| 217 | - # This anti-pattern is needed to avoid the transactions to | |
| 217 | + # This anti-pattern is needed to avoid the transactions to | |
| 218 | 218 | # get stuck in case of errors. |
| 219 | 219 | transaction.rollback() |
| 220 | 220 | raise |
| 221 | - | |
| 221 | + | |
| 222 | 222 | count += 1 |
| 223 | 223 | if count % 1000 == 0: |
| 224 | 224 | transaction.commit() |
| 225 | - | |
| 225 | + | |
| 226 | 226 | transaction.commit() |
| 227 | - | |
| 227 | + | |
| 228 | 228 | def handle(self, *args, **options): |
| 229 | 229 | """Main command method.""" |
| 230 | - | |
| 230 | + | |
| 231 | 231 | lock_file = '/var/lock/colab/import_emails.lock' |
| 232 | - | |
| 232 | + | |
| 233 | 233 | # Already running, so quit |
| 234 | 234 | if os.path.exists(lock_file): |
| 235 | 235 | self.log(("This script is already running. (If your are sure it's " |
| 236 | 236 | "not please delete the lock file in %s')") % lock_file) |
| 237 | 237 | sys.exit(0) |
| 238 | - | |
| 238 | + | |
| 239 | 239 | if not os.path.exists(os.path.dirname(lock_file)): |
| 240 | 240 | os.mkdir(os.path.dirname(lock_file), 0755) |
| 241 | - | |
| 241 | + | |
| 242 | 242 | run_lock = file(lock_file, 'w') |
| 243 | 243 | run_lock.close() |
| 244 | - | |
| 244 | + | |
| 245 | 245 | archives_path = options.get('archives_path') |
| 246 | 246 | self.log('Using archives_path `%s`' % self.default_archives_path) |
| 247 | - | |
| 247 | + | |
| 248 | 248 | if not os.path.exists(archives_path): |
| 249 | 249 | raise CommandError('archives_path (%s) does not exist' % |
| 250 | 250 | archives_path) |
| 251 | - | |
| 252 | - self.import_emails(archives_path, | |
| 251 | + | |
| 252 | + self.import_emails(archives_path, | |
| 253 | 253 | options.get('all'), options.get('exclude_lists')) |
| 254 | - | |
| 254 | + | |
| 255 | 255 | os.remove(lock_file) |
| 256 | - | |
| 256 | + | ... | ... |