Commit d2ef843955101547a6d09ebaad331cac7b92f978
1 parent
9e61c6b8
Exists in
master
and in
39 other branches
Removing a few whitespaces
Showing
1 changed file
with
52 additions
and
52 deletions
Show diff stats
src/super_archives/management/commands/import_emails.py
... | ... | @@ -21,35 +21,35 @@ class Command(BaseCommand, object): |
21 | 21 | """Get emails from mailman archives and import them in the django db. """ |
22 | 22 | |
23 | 23 | help = __doc__ |
24 | - | |
24 | + | |
25 | 25 | default_archives_path = '/var/lib/mailman/archives/private' |
26 | 26 | RE_SUBJECT_CLEAN = re.compile('((re|res|fw|fwd|en|enc):)|\[.*?\]', |
27 | 27 | re.IGNORECASE) |
28 | 28 | THREAD_CACHE = {} |
29 | 29 | EMAIL_ADDR_CACHE = {} |
30 | - | |
30 | + | |
31 | 31 | # A new command line option to get the dump file to parse. |
32 | 32 | option_list = BaseCommand.option_list + ( |
33 | 33 | make_option('--archives_path', |
34 | 34 | dest='archives_path', |
35 | - help='Path of email archives to be imported. (default: %s)' % | |
35 | + help='Path of email archives to be imported. (default: %s)' % | |
36 | 36 | default_archives_path, |
37 | 37 | default=default_archives_path), |
38 | - | |
38 | + | |
39 | 39 | make_option('--exclude-list', |
40 | 40 | dest='exclude_lists', |
41 | - help=("Mailing list that won't be imported. It can be used many" | |
41 | + help=("Mailing list that won't be imported. It can be used many" | |
42 | 42 | "times for more than one list."), |
43 | 43 | action='append', |
44 | 44 | default=None), |
45 | - | |
45 | + | |
46 | 46 | make_option('--all', |
47 | 47 | dest='all', |
48 | 48 | help='Import all messages (default: False)', |
49 | 49 | action="store_true", |
50 | 50 | default=False), |
51 | 51 | ) |
52 | - | |
52 | + | |
53 | 53 | def __init__(self, *args, **kwargs): |
54 | 54 | super(Command, self).__init__(*args, **kwargs) |
55 | 55 | |
... | ... | @@ -68,18 +68,18 @@ class Command(BaseCommand, object): |
68 | 68 | |
69 | 69 | Yield: An instance of `mailbox.mboxMessage` for each email in the |
70 | 70 | file. |
71 | - | |
71 | + | |
72 | 72 | """ |
73 | 73 | self.log("Parsing email dump: %s." % email_filename) |
74 | 74 | mbox = mailbox.mbox(email_filename, factory=CustomMessage) |
75 | - | |
75 | + | |
76 | 76 | # Get each email from mbox file |
77 | 77 | # |
78 | 78 | # The following implementation was used because the object |
79 | - # mbox does not support slicing. Converting the object to a | |
80 | - # tuple (as represented in the code down here) was a valid | |
79 | + # mbox does not support slicing. Converting the object to a | |
80 | + # tuple (as represented in the code down here) was a valid | |
81 | 81 | # option but its performance was too poor. |
82 | - # | |
82 | + # | |
83 | 83 | #for message in tuple(mbox)[index:]: |
84 | 84 | # yield message |
85 | 85 | # |
... | ... | @@ -90,8 +90,8 @@ class Command(BaseCommand, object): |
90 | 90 | |
91 | 91 | def get_emails(self, mailinglist_dir, all, exclude_lists): |
92 | 92 | """Generator function that get the emails from each mailing |
93 | - list dump directory. If `all` is set to True all the emails in the | |
94 | - mbox will be imported if not it will just resume from the last | |
93 | + list dump directory. If `all` is set to True all the emails in the | |
94 | + mbox will be imported if not it will just resume from the last | |
95 | 95 | message previously imported. The lists set in `exclude_lists` |
96 | 96 | won't be imported. |
97 | 97 | |
... | ... | @@ -99,20 +99,20 @@ class Command(BaseCommand, object): |
99 | 99 | |
100 | 100 | """ |
101 | 101 | self.log("Getting emails dumps from: %s" % mailinglist_dir) |
102 | - | |
102 | + | |
103 | 103 | # Get the list of directories ending with .mbox |
104 | - mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) | |
104 | + mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) | |
105 | 105 | if mbox.endswith('.mbox')) |
106 | - | |
106 | + | |
107 | 107 | # Get messages from each mbox |
108 | 108 | for mbox in mailing_lists_mboxes: |
109 | 109 | mbox_path = os.path.join(mailinglist_dir, mbox, mbox) |
110 | 110 | mailinglist_name = mbox.split('.')[0] |
111 | - | |
111 | + | |
112 | 112 | # Check if the mailinglist is set not to be imported |
113 | 113 | if exclude_lists and mailinglist_name in exclude_lists: |
114 | 114 | continue |
115 | - | |
115 | + | |
116 | 116 | # Find the index of the last imported message |
117 | 117 | if all: |
118 | 118 | n_msgs = 0 |
... | ... | @@ -123,13 +123,13 @@ class Command(BaseCommand, object): |
123 | 123 | n_msgs = mailinglist.last_imported_index |
124 | 124 | except MailingList.DoesNotExist: |
125 | 125 | n_msgs = 0 |
126 | - | |
126 | + | |
127 | 127 | for index, msg in self.parse_emails(mbox_path, n_msgs): |
128 | 128 | yield mailinglist_name, msg, index |
129 | 129 | |
130 | 130 | def get_thread(self, email, mailinglist): |
131 | 131 | """Group messages by thread looking for similar subjects""" |
132 | - | |
132 | + | |
133 | 133 | subject_slug = slugify(email.subject_clean) |
134 | 134 | thread = self.THREAD_CACHE.get(subject_slug, {}).get(mailinglist.id) |
135 | 135 | if thread is None: |
... | ... | @@ -137,27 +137,27 @@ class Command(BaseCommand, object): |
137 | 137 | mailinglist=mailinglist, |
138 | 138 | subject_token=subject_slug |
139 | 139 | )[0] |
140 | - | |
140 | + | |
141 | 141 | if self.THREAD_CACHE.get(subject_slug) is None: |
142 | 142 | self.THREAD_CACHE[subject_slug] = dict() |
143 | 143 | self.THREAD_CACHE[subject_slug][mailinglist.id] = thread |
144 | 144 | |
145 | 145 | thread.latest_message = email |
146 | - thread.save() | |
146 | + thread.save() | |
147 | 147 | return thread |
148 | - | |
148 | + | |
149 | 149 | def save_email(self, list_name, email_msg, index): |
150 | 150 | """Save email message into the database.""" |
151 | - | |
151 | + | |
152 | 152 | # Update last imported message into the DB |
153 | 153 | mailinglist, created = MailingList.objects.get_or_create(name=list_name) |
154 | 154 | mailinglist.last_imported_index = index |
155 | - | |
156 | - if created: | |
155 | + | |
156 | + if created: | |
157 | 157 | # if the mailinglist is newly created it's sure that the message |
158 | 158 | # is not in the DB yet. |
159 | 159 | self.create_email(mailinglist, email_msg) |
160 | - | |
160 | + | |
161 | 161 | else: |
162 | 162 | # If the message is already at the database don't do anything |
163 | 163 | try: |
... | ... | @@ -165,11 +165,11 @@ class Command(BaseCommand, object): |
165 | 165 | message_id=email_msg.get('Message-ID'), |
166 | 166 | thread__mailinglist=mailinglist |
167 | 167 | ) |
168 | - | |
168 | + | |
169 | 169 | except Message.DoesNotExist: |
170 | 170 | self.create_email(mailinglist, email_msg) |
171 | - | |
172 | - mailinglist.save() | |
171 | + | |
172 | + mailinglist.save() | |
173 | 173 | |
174 | 174 | def create_email(self, mailinglist, email_msg): |
175 | 175 | |
... | ... | @@ -198,59 +198,59 @@ class Command(BaseCommand, object): |
198 | 198 | email.thread = self.get_thread(email, mailinglist) |
199 | 199 | email.save() |
200 | 200 | |
201 | - @transaction.commit_manually | |
201 | + @transaction.commit_manually | |
202 | 202 | def import_emails(self, archives_path, all, exclude_lists=None): |
203 | - """Get emails from the filesystem from the `archives_path` | |
204 | - and store them into the database. If `all` is set to True all | |
205 | - the filesystem storage will be imported otherwise the | |
206 | - importation will resume from the last message previously | |
203 | + """Get emails from the filesystem from the `archives_path` | |
204 | + and store them into the database. If `all` is set to True all | |
205 | + the filesystem storage will be imported otherwise the | |
206 | + importation will resume from the last message previously | |
207 | 207 | imported. The lists set in `exclude_lists` won't be imported. |
208 | - | |
208 | + | |
209 | 209 | """ |
210 | - | |
210 | + | |
211 | 211 | count = 0 |
212 | 212 | email_generator = self.get_emails(archives_path, all, exclude_lists) |
213 | 213 | for mailinglist_name, msg, index in email_generator: |
214 | 214 | try: |
215 | 215 | self.save_email(mailinglist_name, msg, index) |
216 | 216 | except: |
217 | - # This anti-pattern is needed to avoid the transactions to | |
217 | + # This anti-pattern is needed to avoid the transactions to | |
218 | 218 | # get stuck in case of errors. |
219 | 219 | transaction.rollback() |
220 | 220 | raise |
221 | - | |
221 | + | |
222 | 222 | count += 1 |
223 | 223 | if count % 1000 == 0: |
224 | 224 | transaction.commit() |
225 | - | |
225 | + | |
226 | 226 | transaction.commit() |
227 | - | |
227 | + | |
228 | 228 | def handle(self, *args, **options): |
229 | 229 | """Main command method.""" |
230 | - | |
230 | + | |
231 | 231 | lock_file = '/var/lock/colab/import_emails.lock' |
232 | - | |
232 | + | |
233 | 233 | # Already running, so quit |
234 | 234 | if os.path.exists(lock_file): |
235 | 235 | self.log(("This script is already running. (If your are sure it's " |
236 | 236 | "not please delete the lock file in %s')") % lock_file) |
237 | 237 | sys.exit(0) |
238 | - | |
238 | + | |
239 | 239 | if not os.path.exists(os.path.dirname(lock_file)): |
240 | 240 | os.mkdir(os.path.dirname(lock_file), 0755) |
241 | - | |
241 | + | |
242 | 242 | run_lock = file(lock_file, 'w') |
243 | 243 | run_lock.close() |
244 | - | |
244 | + | |
245 | 245 | archives_path = options.get('archives_path') |
246 | 246 | self.log('Using archives_path `%s`' % self.default_archives_path) |
247 | - | |
247 | + | |
248 | 248 | if not os.path.exists(archives_path): |
249 | 249 | raise CommandError('archives_path (%s) does not exist' % |
250 | 250 | archives_path) |
251 | - | |
252 | - self.import_emails(archives_path, | |
251 | + | |
252 | + self.import_emails(archives_path, | |
253 | 253 | options.get('all'), options.get('exclude_lists')) |
254 | - | |
254 | + | |
255 | 255 | os.remove(lock_file) |
256 | - | |
256 | + | ... | ... |