Commit d2ef843955101547a6d09ebaad331cac7b92f978
1 parent
9e61c6b8
Exists in
master
and in
39 other branches
Removing a few whitespaces
Showing
1 changed file
with
52 additions
and
52 deletions
Show diff stats
src/super_archives/management/commands/import_emails.py
@@ -21,35 +21,35 @@ class Command(BaseCommand, object): | @@ -21,35 +21,35 @@ class Command(BaseCommand, object): | ||
21 | """Get emails from mailman archives and import them in the django db. """ | 21 | """Get emails from mailman archives and import them in the django db. """ |
22 | 22 | ||
23 | help = __doc__ | 23 | help = __doc__ |
24 | - | 24 | + |
25 | default_archives_path = '/var/lib/mailman/archives/private' | 25 | default_archives_path = '/var/lib/mailman/archives/private' |
26 | RE_SUBJECT_CLEAN = re.compile('((re|res|fw|fwd|en|enc):)|\[.*?\]', | 26 | RE_SUBJECT_CLEAN = re.compile('((re|res|fw|fwd|en|enc):)|\[.*?\]', |
27 | re.IGNORECASE) | 27 | re.IGNORECASE) |
28 | THREAD_CACHE = {} | 28 | THREAD_CACHE = {} |
29 | EMAIL_ADDR_CACHE = {} | 29 | EMAIL_ADDR_CACHE = {} |
30 | - | 30 | + |
31 | # A new command line option to get the dump file to parse. | 31 | # A new command line option to get the dump file to parse. |
32 | option_list = BaseCommand.option_list + ( | 32 | option_list = BaseCommand.option_list + ( |
33 | make_option('--archives_path', | 33 | make_option('--archives_path', |
34 | dest='archives_path', | 34 | dest='archives_path', |
35 | - help='Path of email archives to be imported. (default: %s)' % | 35 | + help='Path of email archives to be imported. (default: %s)' % |
36 | default_archives_path, | 36 | default_archives_path, |
37 | default=default_archives_path), | 37 | default=default_archives_path), |
38 | - | 38 | + |
39 | make_option('--exclude-list', | 39 | make_option('--exclude-list', |
40 | dest='exclude_lists', | 40 | dest='exclude_lists', |
41 | - help=("Mailing list that won't be imported. It can be used many" | 41 | + help=("Mailing list that won't be imported. It can be used many" |
42 | "times for more than one list."), | 42 | "times for more than one list."), |
43 | action='append', | 43 | action='append', |
44 | default=None), | 44 | default=None), |
45 | - | 45 | + |
46 | make_option('--all', | 46 | make_option('--all', |
47 | dest='all', | 47 | dest='all', |
48 | help='Import all messages (default: False)', | 48 | help='Import all messages (default: False)', |
49 | action="store_true", | 49 | action="store_true", |
50 | default=False), | 50 | default=False), |
51 | ) | 51 | ) |
52 | - | 52 | + |
53 | def __init__(self, *args, **kwargs): | 53 | def __init__(self, *args, **kwargs): |
54 | super(Command, self).__init__(*args, **kwargs) | 54 | super(Command, self).__init__(*args, **kwargs) |
55 | 55 | ||
@@ -68,18 +68,18 @@ class Command(BaseCommand, object): | @@ -68,18 +68,18 @@ class Command(BaseCommand, object): | ||
68 | 68 | ||
69 | Yield: An instance of `mailbox.mboxMessage` for each email in the | 69 | Yield: An instance of `mailbox.mboxMessage` for each email in the |
70 | file. | 70 | file. |
71 | - | 71 | + |
72 | """ | 72 | """ |
73 | self.log("Parsing email dump: %s." % email_filename) | 73 | self.log("Parsing email dump: %s." % email_filename) |
74 | mbox = mailbox.mbox(email_filename, factory=CustomMessage) | 74 | mbox = mailbox.mbox(email_filename, factory=CustomMessage) |
75 | - | 75 | + |
76 | # Get each email from mbox file | 76 | # Get each email from mbox file |
77 | # | 77 | # |
78 | # The following implementation was used because the object | 78 | # The following implementation was used because the object |
79 | - # mbox does not support slicing. Converting the object to a | ||
80 | - # tuple (as represented in the code down here) was a valid | 79 | + # mbox does not support slicing. Converting the object to a |
80 | + # tuple (as represented in the code down here) was a valid | ||
81 | # option but its performance was too poor. | 81 | # option but its performance was too poor. |
82 | - # | 82 | + # |
83 | #for message in tuple(mbox)[index:]: | 83 | #for message in tuple(mbox)[index:]: |
84 | # yield message | 84 | # yield message |
85 | # | 85 | # |
@@ -90,8 +90,8 @@ class Command(BaseCommand, object): | @@ -90,8 +90,8 @@ class Command(BaseCommand, object): | ||
90 | 90 | ||
91 | def get_emails(self, mailinglist_dir, all, exclude_lists): | 91 | def get_emails(self, mailinglist_dir, all, exclude_lists): |
92 | """Generator function that gets the emails from each mailing | 92 | """Generator function that gets the emails from each mailing |
93 | - list dump directory. If `all` is set to True all the emails in the | ||
94 | - mbox will be imported if not it will just resume from the last | 93 | + list dump directory. If `all` is set to True all the emails in the |
94 | + mbox will be imported if not it will just resume from the last | ||
95 | message previously imported. The lists set in `exclude_lists` | 95 | message previously imported. The lists set in `exclude_lists` |
96 | won't be imported. | 96 | won't be imported. |
97 | 97 | ||
@@ -99,20 +99,20 @@ class Command(BaseCommand, object): | @@ -99,20 +99,20 @@ class Command(BaseCommand, object): | ||
99 | 99 | ||
100 | """ | 100 | """ |
101 | self.log("Getting emails dumps from: %s" % mailinglist_dir) | 101 | self.log("Getting emails dumps from: %s" % mailinglist_dir) |
102 | - | 102 | + |
103 | # Get the list of directories ending with .mbox | 103 | # Get the list of directories ending with .mbox |
104 | - mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) | 104 | + mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) |
105 | if mbox.endswith('.mbox')) | 105 | if mbox.endswith('.mbox')) |
106 | - | 106 | + |
107 | # Get messages from each mbox | 107 | # Get messages from each mbox |
108 | for mbox in mailing_lists_mboxes: | 108 | for mbox in mailing_lists_mboxes: |
109 | mbox_path = os.path.join(mailinglist_dir, mbox, mbox) | 109 | mbox_path = os.path.join(mailinglist_dir, mbox, mbox) |
110 | mailinglist_name = mbox.split('.')[0] | 110 | mailinglist_name = mbox.split('.')[0] |
111 | - | 111 | + |
112 | # Check if the mailinglist is set not to be imported | 112 | # Check if the mailinglist is set not to be imported |
113 | if exclude_lists and mailinglist_name in exclude_lists: | 113 | if exclude_lists and mailinglist_name in exclude_lists: |
114 | continue | 114 | continue |
115 | - | 115 | + |
116 | # Find the index of the last imported message | 116 | # Find the index of the last imported message |
117 | if all: | 117 | if all: |
118 | n_msgs = 0 | 118 | n_msgs = 0 |
@@ -123,13 +123,13 @@ class Command(BaseCommand, object): | @@ -123,13 +123,13 @@ class Command(BaseCommand, object): | ||
123 | n_msgs = mailinglist.last_imported_index | 123 | n_msgs = mailinglist.last_imported_index |
124 | except MailingList.DoesNotExist: | 124 | except MailingList.DoesNotExist: |
125 | n_msgs = 0 | 125 | n_msgs = 0 |
126 | - | 126 | + |
127 | for index, msg in self.parse_emails(mbox_path, n_msgs): | 127 | for index, msg in self.parse_emails(mbox_path, n_msgs): |
128 | yield mailinglist_name, msg, index | 128 | yield mailinglist_name, msg, index |
129 | 129 | ||
130 | def get_thread(self, email, mailinglist): | 130 | def get_thread(self, email, mailinglist): |
131 | """Group messages by thread looking for similar subjects""" | 131 | """Group messages by thread looking for similar subjects""" |
132 | - | 132 | + |
133 | subject_slug = slugify(email.subject_clean) | 133 | subject_slug = slugify(email.subject_clean) |
134 | thread = self.THREAD_CACHE.get(subject_slug, {}).get(mailinglist.id) | 134 | thread = self.THREAD_CACHE.get(subject_slug, {}).get(mailinglist.id) |
135 | if thread is None: | 135 | if thread is None: |
@@ -137,27 +137,27 @@ class Command(BaseCommand, object): | @@ -137,27 +137,27 @@ class Command(BaseCommand, object): | ||
137 | mailinglist=mailinglist, | 137 | mailinglist=mailinglist, |
138 | subject_token=subject_slug | 138 | subject_token=subject_slug |
139 | )[0] | 139 | )[0] |
140 | - | 140 | + |
141 | if self.THREAD_CACHE.get(subject_slug) is None: | 141 | if self.THREAD_CACHE.get(subject_slug) is None: |
142 | self.THREAD_CACHE[subject_slug] = dict() | 142 | self.THREAD_CACHE[subject_slug] = dict() |
143 | self.THREAD_CACHE[subject_slug][mailinglist.id] = thread | 143 | self.THREAD_CACHE[subject_slug][mailinglist.id] = thread |
144 | 144 | ||
145 | thread.latest_message = email | 145 | thread.latest_message = email |
146 | - thread.save() | 146 | + thread.save() |
147 | return thread | 147 | return thread |
148 | - | 148 | + |
149 | def save_email(self, list_name, email_msg, index): | 149 | def save_email(self, list_name, email_msg, index): |
150 | """Save email message into the database.""" | 150 | """Save email message into the database.""" |
151 | - | 151 | + |
152 | # Update last imported message into the DB | 152 | # Update last imported message into the DB |
153 | mailinglist, created = MailingList.objects.get_or_create(name=list_name) | 153 | mailinglist, created = MailingList.objects.get_or_create(name=list_name) |
154 | mailinglist.last_imported_index = index | 154 | mailinglist.last_imported_index = index |
155 | - | ||
156 | - if created: | 155 | + |
156 | + if created: | ||
157 | # if the mailinglist is newly created it's sure that the message | 157 | # if the mailinglist is newly created it's sure that the message |
158 | # is not in the DB yet. | 158 | # is not in the DB yet. |
159 | self.create_email(mailinglist, email_msg) | 159 | self.create_email(mailinglist, email_msg) |
160 | - | 160 | + |
161 | else: | 161 | else: |
162 | # If the message is already in the database don't do anything | 162 | # If the message is already in the database don't do anything |
163 | try: | 163 | try: |
@@ -165,11 +165,11 @@ class Command(BaseCommand, object): | @@ -165,11 +165,11 @@ class Command(BaseCommand, object): | ||
165 | message_id=email_msg.get('Message-ID'), | 165 | message_id=email_msg.get('Message-ID'), |
166 | thread__mailinglist=mailinglist | 166 | thread__mailinglist=mailinglist |
167 | ) | 167 | ) |
168 | - | 168 | + |
169 | except Message.DoesNotExist: | 169 | except Message.DoesNotExist: |
170 | self.create_email(mailinglist, email_msg) | 170 | self.create_email(mailinglist, email_msg) |
171 | - | ||
172 | - mailinglist.save() | 171 | + |
172 | + mailinglist.save() | ||
173 | 173 | ||
174 | def create_email(self, mailinglist, email_msg): | 174 | def create_email(self, mailinglist, email_msg): |
175 | 175 | ||
@@ -198,59 +198,59 @@ class Command(BaseCommand, object): | @@ -198,59 +198,59 @@ class Command(BaseCommand, object): | ||
198 | email.thread = self.get_thread(email, mailinglist) | 198 | email.thread = self.get_thread(email, mailinglist) |
199 | email.save() | 199 | email.save() |
200 | 200 | ||
201 | - @transaction.commit_manually | 201 | + @transaction.commit_manually |
202 | def import_emails(self, archives_path, all, exclude_lists=None): | 202 | def import_emails(self, archives_path, all, exclude_lists=None): |
203 | - """Get emails from the filesystem from the `archives_path` | ||
204 | - and store them into the database. If `all` is set to True all | ||
205 | - the filesystem storage will be imported otherwise the | ||
206 | - importation will resume from the last message previously | 203 | + """Get emails from the filesystem from the `archives_path` |
204 | + and store them into the database. If `all` is set to True all | ||
205 | + the filesystem storage will be imported otherwise the | ||
206 | + importation will resume from the last message previously | ||
207 | imported. The lists set in `exclude_lists` won't be imported. | 207 | imported. The lists set in `exclude_lists` won't be imported. |
208 | - | 208 | + |
209 | """ | 209 | """ |
210 | - | 210 | + |
211 | count = 0 | 211 | count = 0 |
212 | email_generator = self.get_emails(archives_path, all, exclude_lists) | 212 | email_generator = self.get_emails(archives_path, all, exclude_lists) |
213 | for mailinglist_name, msg, index in email_generator: | 213 | for mailinglist_name, msg, index in email_generator: |
214 | try: | 214 | try: |
215 | self.save_email(mailinglist_name, msg, index) | 215 | self.save_email(mailinglist_name, msg, index) |
216 | except: | 216 | except: |
217 | - # This anti-pattern is needed to avoid the transactions to | 217 | + # This anti-pattern is needed to avoid the transactions to |
218 | # get stuck in case of errors. | 218 | # get stuck in case of errors. |
219 | transaction.rollback() | 219 | transaction.rollback() |
220 | raise | 220 | raise |
221 | - | 221 | + |
222 | count += 1 | 222 | count += 1 |
223 | if count % 1000 == 0: | 223 | if count % 1000 == 0: |
224 | transaction.commit() | 224 | transaction.commit() |
225 | - | 225 | + |
226 | transaction.commit() | 226 | transaction.commit() |
227 | - | 227 | + |
228 | def handle(self, *args, **options): | 228 | def handle(self, *args, **options): |
229 | """Main command method.""" | 229 | """Main command method.""" |
230 | - | 230 | + |
231 | lock_file = '/var/lock/colab/import_emails.lock' | 231 | lock_file = '/var/lock/colab/import_emails.lock' |
232 | - | 232 | + |
233 | # Already running, so quit | 233 | # Already running, so quit |
234 | if os.path.exists(lock_file): | 234 | if os.path.exists(lock_file): |
235 | self.log(("This script is already running. (If your are sure it's " | 235 | self.log(("This script is already running. (If your are sure it's " |
236 | "not please delete the lock file in %s')") % lock_file) | 236 | "not please delete the lock file in %s')") % lock_file) |
237 | sys.exit(0) | 237 | sys.exit(0) |
238 | - | 238 | + |
239 | if not os.path.exists(os.path.dirname(lock_file)): | 239 | if not os.path.exists(os.path.dirname(lock_file)): |
240 | os.mkdir(os.path.dirname(lock_file), 0755) | 240 | os.mkdir(os.path.dirname(lock_file), 0755) |
241 | - | 241 | + |
242 | run_lock = file(lock_file, 'w') | 242 | run_lock = file(lock_file, 'w') |
243 | run_lock.close() | 243 | run_lock.close() |
244 | - | 244 | + |
245 | archives_path = options.get('archives_path') | 245 | archives_path = options.get('archives_path') |
246 | self.log('Using archives_path `%s`' % self.default_archives_path) | 246 | self.log('Using archives_path `%s`' % self.default_archives_path) |
247 | - | 247 | + |
248 | if not os.path.exists(archives_path): | 248 | if not os.path.exists(archives_path): |
249 | raise CommandError('archives_path (%s) does not exist' % | 249 | raise CommandError('archives_path (%s) does not exist' % |
250 | archives_path) | 250 | archives_path) |
251 | - | ||
252 | - self.import_emails(archives_path, | 251 | + |
252 | + self.import_emails(archives_path, | ||
253 | options.get('all'), options.get('exclude_lists')) | 253 | options.get('all'), options.get('exclude_lists')) |
254 | - | 254 | + |
255 | os.remove(lock_file) | 255 | os.remove(lock_file) |
256 | - | 256 | + |