Commit d2ef843955101547a6d09ebaad331cac7b92f978

Authored by Luan
1 parent 9e61c6b8

Removing a few whitespaces

src/super_archives/management/commands/import_emails.py
@@ -21,35 +21,35 @@ class Command(BaseCommand, object): @@ -21,35 +21,35 @@ class Command(BaseCommand, object):
21 """Get emails from mailman archives and import them in the django db. """ 21 """Get emails from mailman archives and import them in the django db. """
22 22
23 help = __doc__ 23 help = __doc__
24 - 24 +
25 default_archives_path = '/var/lib/mailman/archives/private' 25 default_archives_path = '/var/lib/mailman/archives/private'
26 RE_SUBJECT_CLEAN = re.compile('((re|res|fw|fwd|en|enc):)|\[.*?\]', 26 RE_SUBJECT_CLEAN = re.compile('((re|res|fw|fwd|en|enc):)|\[.*?\]',
27 re.IGNORECASE) 27 re.IGNORECASE)
28 THREAD_CACHE = {} 28 THREAD_CACHE = {}
29 EMAIL_ADDR_CACHE = {} 29 EMAIL_ADDR_CACHE = {}
30 - 30 +
31 # A new command line option to get the dump file to parse. 31 # A new command line option to get the dump file to parse.
32 option_list = BaseCommand.option_list + ( 32 option_list = BaseCommand.option_list + (
33 make_option('--archives_path', 33 make_option('--archives_path',
34 dest='archives_path', 34 dest='archives_path',
35 - help='Path of email archives to be imported. (default: %s)' % 35 + help='Path of email archives to be imported. (default: %s)' %
36 default_archives_path, 36 default_archives_path,
37 default=default_archives_path), 37 default=default_archives_path),
38 - 38 +
39 make_option('--exclude-list', 39 make_option('--exclude-list',
40 dest='exclude_lists', 40 dest='exclude_lists',
41 - help=("Mailing list that won't be imported. It can be used many" 41 + help=("Mailing list that won't be imported. It can be used many"
42 "times for more than one list."), 42 "times for more than one list."),
43 action='append', 43 action='append',
44 default=None), 44 default=None),
45 - 45 +
46 make_option('--all', 46 make_option('--all',
47 dest='all', 47 dest='all',
48 help='Import all messages (default: False)', 48 help='Import all messages (default: False)',
49 action="store_true", 49 action="store_true",
50 default=False), 50 default=False),
51 ) 51 )
52 - 52 +
53 def __init__(self, *args, **kwargs): 53 def __init__(self, *args, **kwargs):
54 super(Command, self).__init__(*args, **kwargs) 54 super(Command, self).__init__(*args, **kwargs)
55 55
@@ -68,18 +68,18 @@ class Command(BaseCommand, object): @@ -68,18 +68,18 @@ class Command(BaseCommand, object):
68 68
69 Yield: An instance of `mailbox.mboxMessage` for each email in the 69 Yield: An instance of `mailbox.mboxMessage` for each email in the
70 file. 70 file.
71 - 71 +
72 """ 72 """
73 self.log("Parsing email dump: %s." % email_filename) 73 self.log("Parsing email dump: %s." % email_filename)
74 mbox = mailbox.mbox(email_filename, factory=CustomMessage) 74 mbox = mailbox.mbox(email_filename, factory=CustomMessage)
75 - 75 +
76 # Get each email from mbox file 76 # Get each email from mbox file
77 # 77 #
78 # The following implementation was used because the object 78 # The following implementation was used because the object
79 - # mbox does not support slicing. Converting the object to a  
80 - # tuple (as represented in the code down here) was a valid 79 + # mbox does not support slicing. Converting the object to a
  80 + # tuple (as represented in the code down here) was a valid
81 # option but its performance was too poor. 81 # option but its performance was too poor.
82 - # 82 + #
83 #for message in tuple(mbox)[index:]: 83 #for message in tuple(mbox)[index:]:
84 # yield message 84 # yield message
85 # 85 #
@@ -90,8 +90,8 @@ class Command(BaseCommand, object): @@ -90,8 +90,8 @@ class Command(BaseCommand, object):
90 90
91 def get_emails(self, mailinglist_dir, all, exclude_lists): 91 def get_emails(self, mailinglist_dir, all, exclude_lists):
92 """Generator function that get the emails from each mailing 92 """Generator function that get the emails from each mailing
93 - list dump directory. If `all` is set to True all the emails in the  
94 - mbox will be imported if not it will just resume from the last 93 + list dump directory. If `all` is set to True all the emails in the
  94 + mbox will be imported if not it will just resume from the last
95 message previously imported. The lists set in `exclude_lists` 95 message previously imported. The lists set in `exclude_lists`
96 won't be imported. 96 won't be imported.
97 97
@@ -99,20 +99,20 @@ class Command(BaseCommand, object): @@ -99,20 +99,20 @@ class Command(BaseCommand, object):
99 99
100 """ 100 """
101 self.log("Getting emails dumps from: %s" % mailinglist_dir) 101 self.log("Getting emails dumps from: %s" % mailinglist_dir)
102 - 102 +
103 # Get the list of directories ending with .mbox 103 # Get the list of directories ending with .mbox
104 - mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir) 104 + mailing_lists_mboxes = (mbox for mbox in os.listdir(mailinglist_dir)
105 if mbox.endswith('.mbox')) 105 if mbox.endswith('.mbox'))
106 - 106 +
107 # Get messages from each mbox 107 # Get messages from each mbox
108 for mbox in mailing_lists_mboxes: 108 for mbox in mailing_lists_mboxes:
109 mbox_path = os.path.join(mailinglist_dir, mbox, mbox) 109 mbox_path = os.path.join(mailinglist_dir, mbox, mbox)
110 mailinglist_name = mbox.split('.')[0] 110 mailinglist_name = mbox.split('.')[0]
111 - 111 +
112 # Check if the mailinglist is set not to be imported 112 # Check if the mailinglist is set not to be imported
113 if exclude_lists and mailinglist_name in exclude_lists: 113 if exclude_lists and mailinglist_name in exclude_lists:
114 continue 114 continue
115 - 115 +
116 # Find the index of the last imported message 116 # Find the index of the last imported message
117 if all: 117 if all:
118 n_msgs = 0 118 n_msgs = 0
@@ -123,13 +123,13 @@ class Command(BaseCommand, object): @@ -123,13 +123,13 @@ class Command(BaseCommand, object):
123 n_msgs = mailinglist.last_imported_index 123 n_msgs = mailinglist.last_imported_index
124 except MailingList.DoesNotExist: 124 except MailingList.DoesNotExist:
125 n_msgs = 0 125 n_msgs = 0
126 - 126 +
127 for index, msg in self.parse_emails(mbox_path, n_msgs): 127 for index, msg in self.parse_emails(mbox_path, n_msgs):
128 yield mailinglist_name, msg, index 128 yield mailinglist_name, msg, index
129 129
130 def get_thread(self, email, mailinglist): 130 def get_thread(self, email, mailinglist):
131 """Group messages by thread looking for similar subjects""" 131 """Group messages by thread looking for similar subjects"""
132 - 132 +
133 subject_slug = slugify(email.subject_clean) 133 subject_slug = slugify(email.subject_clean)
134 thread = self.THREAD_CACHE.get(subject_slug, {}).get(mailinglist.id) 134 thread = self.THREAD_CACHE.get(subject_slug, {}).get(mailinglist.id)
135 if thread is None: 135 if thread is None:
@@ -137,27 +137,27 @@ class Command(BaseCommand, object): @@ -137,27 +137,27 @@ class Command(BaseCommand, object):
137 mailinglist=mailinglist, 137 mailinglist=mailinglist,
138 subject_token=subject_slug 138 subject_token=subject_slug
139 )[0] 139 )[0]
140 - 140 +
141 if self.THREAD_CACHE.get(subject_slug) is None: 141 if self.THREAD_CACHE.get(subject_slug) is None:
142 self.THREAD_CACHE[subject_slug] = dict() 142 self.THREAD_CACHE[subject_slug] = dict()
143 self.THREAD_CACHE[subject_slug][mailinglist.id] = thread 143 self.THREAD_CACHE[subject_slug][mailinglist.id] = thread
144 144
145 thread.latest_message = email 145 thread.latest_message = email
146 - thread.save() 146 + thread.save()
147 return thread 147 return thread
148 - 148 +
149 def save_email(self, list_name, email_msg, index): 149 def save_email(self, list_name, email_msg, index):
150 """Save email message into the database.""" 150 """Save email message into the database."""
151 - 151 +
152 # Update last imported message into the DB 152 # Update last imported message into the DB
153 mailinglist, created = MailingList.objects.get_or_create(name=list_name) 153 mailinglist, created = MailingList.objects.get_or_create(name=list_name)
154 mailinglist.last_imported_index = index 154 mailinglist.last_imported_index = index
155 -  
156 - if created: 155 +
  156 + if created:
157 # if the mailinglist is newly created it's sure that the message 157 # if the mailinglist is newly created it's sure that the message
158 # is not in the DB yet. 158 # is not in the DB yet.
159 self.create_email(mailinglist, email_msg) 159 self.create_email(mailinglist, email_msg)
160 - 160 +
161 else: 161 else:
162 # If the message is already at the database don't do anything 162 # If the message is already at the database don't do anything
163 try: 163 try:
@@ -165,11 +165,11 @@ class Command(BaseCommand, object): @@ -165,11 +165,11 @@ class Command(BaseCommand, object):
165 message_id=email_msg.get('Message-ID'), 165 message_id=email_msg.get('Message-ID'),
166 thread__mailinglist=mailinglist 166 thread__mailinglist=mailinglist
167 ) 167 )
168 - 168 +
169 except Message.DoesNotExist: 169 except Message.DoesNotExist:
170 self.create_email(mailinglist, email_msg) 170 self.create_email(mailinglist, email_msg)
171 -  
172 - mailinglist.save() 171 +
  172 + mailinglist.save()
173 173
174 def create_email(self, mailinglist, email_msg): 174 def create_email(self, mailinglist, email_msg):
175 175
@@ -198,59 +198,59 @@ class Command(BaseCommand, object): @@ -198,59 +198,59 @@ class Command(BaseCommand, object):
198 email.thread = self.get_thread(email, mailinglist) 198 email.thread = self.get_thread(email, mailinglist)
199 email.save() 199 email.save()
200 200
201 - @transaction.commit_manually 201 + @transaction.commit_manually
202 def import_emails(self, archives_path, all, exclude_lists=None): 202 def import_emails(self, archives_path, all, exclude_lists=None):
203 - """Get emails from the filesystem from the `archives_path`  
204 - and store them into the database. If `all` is set to True all  
205 - the filesystem storage will be imported otherwise the  
206 - importation will resume from the last message previously 203 + """Get emails from the filesystem from the `archives_path`
  204 + and store them into the database. If `all` is set to True all
  205 + the filesystem storage will be imported otherwise the
  206 + importation will resume from the last message previously
207 imported. The lists set in `exclude_lists` won't be imported. 207 imported. The lists set in `exclude_lists` won't be imported.
208 - 208 +
209 """ 209 """
210 - 210 +
211 count = 0 211 count = 0
212 email_generator = self.get_emails(archives_path, all, exclude_lists) 212 email_generator = self.get_emails(archives_path, all, exclude_lists)
213 for mailinglist_name, msg, index in email_generator: 213 for mailinglist_name, msg, index in email_generator:
214 try: 214 try:
215 self.save_email(mailinglist_name, msg, index) 215 self.save_email(mailinglist_name, msg, index)
216 except: 216 except:
217 - # This anti-pattern is needed to avoid the transactions to 217 + # This anti-pattern is needed to avoid the transactions to
218 # get stuck in case of errors. 218 # get stuck in case of errors.
219 transaction.rollback() 219 transaction.rollback()
220 raise 220 raise
221 - 221 +
222 count += 1 222 count += 1
223 if count % 1000 == 0: 223 if count % 1000 == 0:
224 transaction.commit() 224 transaction.commit()
225 - 225 +
226 transaction.commit() 226 transaction.commit()
227 - 227 +
228 def handle(self, *args, **options): 228 def handle(self, *args, **options):
229 """Main command method.""" 229 """Main command method."""
230 - 230 +
231 lock_file = '/var/lock/colab/import_emails.lock' 231 lock_file = '/var/lock/colab/import_emails.lock'
232 - 232 +
233 # Already running, so quit 233 # Already running, so quit
234 if os.path.exists(lock_file): 234 if os.path.exists(lock_file):
235 self.log(("This script is already running. (If your are sure it's " 235 self.log(("This script is already running. (If your are sure it's "
236 "not please delete the lock file in %s')") % lock_file) 236 "not please delete the lock file in %s')") % lock_file)
237 sys.exit(0) 237 sys.exit(0)
238 - 238 +
239 if not os.path.exists(os.path.dirname(lock_file)): 239 if not os.path.exists(os.path.dirname(lock_file)):
240 os.mkdir(os.path.dirname(lock_file), 0755) 240 os.mkdir(os.path.dirname(lock_file), 0755)
241 - 241 +
242 run_lock = file(lock_file, 'w') 242 run_lock = file(lock_file, 'w')
243 run_lock.close() 243 run_lock.close()
244 - 244 +
245 archives_path = options.get('archives_path') 245 archives_path = options.get('archives_path')
246 self.log('Using archives_path `%s`' % self.default_archives_path) 246 self.log('Using archives_path `%s`' % self.default_archives_path)
247 - 247 +
248 if not os.path.exists(archives_path): 248 if not os.path.exists(archives_path):
249 raise CommandError('archives_path (%s) does not exist' % 249 raise CommandError('archives_path (%s) does not exist' %
250 archives_path) 250 archives_path)
251 -  
252 - self.import_emails(archives_path, 251 +
  252 + self.import_emails(archives_path,
253 options.get('all'), options.get('exclude_lists')) 253 options.get('all'), options.get('exclude_lists'))
254 - 254 +
255 os.remove(lock_file) 255 os.remove(lock_file)
256 - 256 +