extractor.py 3.59 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Author: Caio Marcelo Campoy Guedes
E-Mail: caiomcg@gmail.com

Author: Erickson Silva
E-Mail: erickson.silva@lavid.ufpb.br

Author: Jorismar Barbosa
E-Mail: jorismar.barbosa@lavid.ufpb.br

Author: Wesnydy Lima Ribeiro
E-Mail: wesnydy@lavid.ufpb.br
"""

import json
import logging
import os
import pika
import PikaManager
import pysrt

from thread import start_new_thread
from time import sleep
from urllib import urlretrieve

# Logging configuration: the log file receives everything from DEBUG up,
# while the console only shows INFO and above. Both share one format.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

# File handler (full detail).
fh = logging.FileHandler("/home/vlibras/log/extractor.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

# Console handler (less verbose).
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

# The "extractor" logger itself lets every level through; the handlers
# above decide what actually gets written where.
logger = logging.getLogger("extractor")
logger.setLevel(logging.DEBUG)
logger.addHandler(fh)
logger.addHandler(ch)

# Manager of queues connections.
# Alternative with explicit host and credentials:
#manager = PikaManager.PikaManager("150.165.205.10", "test", "test")
manager = PikaManager.PikaManager("rabbit")

def run(ch, method, properties, body):
	"""
	Execute the worker.

	Downloads the subtitle file referenced by the request, publishes one
	message per subtitle entry to the "extractions" queue and finally
	publishes a control message that marks the end of the subtitles.
	The downloaded temporary file is always removed, even when parsing
	or publishing fails.

	Parameters
	----------
	ch : object
		Channel of communication. Not used by this function.
	method : object
		Delivery information supplied with the message. Not used by this
		function.
	properties : object
		Message properties. Its ``correlation_id`` is logged and the whole
		object is forwarded with every published message.
	body : string
		Json string containing the necessary arguments for workers. It
		must contain a "subtitle" key holding the URL of the subtitle file.

	Returns
	-------
	None
		Returns early, without publishing anything, when the subtitle
		download fails.
	"""
	# Lazy %-formatting also copes with a missing (None) correlation id.
	logger.info("processing request %s", properties.correlation_id)
	body = json.loads(body)
	try:
		logger.info("Downloading subtitle")
		filename = urlretrieve(body["subtitle"].encode("utf-8"))[0]
	except IOError:
		# logger.exception keeps the reason of the failure in the log.
		logger.exception("Download of subtitle fail")
		return

	# Number of subtitles published so far; stays 0 for an empty file.
	extracted = 0
	try:
		try:
			# Tries to open file with utf-8 encoding.
			subtitle = pysrt.open(filename)
		except UnicodeDecodeError:
			# Tries to open file with iso-8859-1 encoding if utf-8 encoding fails.
			subtitle = pysrt.open(filename, encoding="iso-8859-1")

		print ("Extracting...")
		logger.info("Extracting subtitles from file")
		# Subtitle indexes are 1-based.
		for extracted, sub in enumerate(subtitle, 1):
			pts = calculate_ms(str(sub.start))
			message = {"text": sub.text.encode("utf-8"), "pts": pts, "index": extracted}
			manager.send_to_queue("extractions", message, properties)
		logger.info("%d Subtitles extracted successfully", extracted)
	finally:
		# Runs on success and on failure so the temp file never leaks.
		logger.info("Cleaning temp files")
		os.remove(filename)

	# Control message indicating the end of subtitles. It reuses the
	# original request body and carries the index following the last subtitle.
	body["control-message"] = "FINALIZE"
	body["pts"] = -1
	body["index"] = extracted + 1

	logger.info("Sending control message to the queue")
	manager.send_to_queue("extractions", body, properties)
	print ("Ok")

def calculate_ms(time_in):
	"""
	Convert an "HH:MM:SS,mmm" timestamp into milliseconds.

	Parameters
	----------
	time_in : string
		Timestamp whose hours, minutes and seconds are separated by ':'
		and whose milliseconds follow the seconds after a ','.

	Returns
	-------
	number
		The timestamp in milliseconds.
	"""
	# Milliseconds contributed by one unit of each field, in field order:
	# hours, minutes, seconds, milliseconds.
	factors = (3600000, 60000, 1000, 1)

	colon_parts = time_in.split(':')
	# Hours and minutes are used as they are; the third part still holds
	# "seconds,milliseconds" and is split once more.
	fields = colon_parts[:2] + colon_parts[2].split(',')

	total = 0
	for position, factor in enumerate(factors):
		total += int(fields[position]) * factor
	return total

def keep_alive(conn_send, conn_receive):
	"""
	Keep the connection alive.

	Loops forever: every 30 seconds it calls ``process_data_events()`` on
	both connections. A failure on one connection is logged and does not
	prevent the other connection from being serviced, nor does it stop
	the loop.

	Parameters
	----------
	conn_send : object
		Connection of writer.
	conn_receive : object
		Connection of receiver.
	"""
	# NOTE(review): presumably these are pika connections that are also used
	# by the main thread; confirm that calling process_data_events() from
	# this thread is safe for the connection type in use.
	connections = (("send", conn_send), ("receive", conn_receive))
	while True:
		sleep(30)
		for label, connection in connections:
			try:
				connection.process_data_events()
			except Exception as error:
				# Best effort: report the problem and try again next round.
				logger.warning("Keep-alive failed on %s connection: %s", label, error)

# Service both queue connections from a background thread (see keep_alive)
# while the main thread is busy consuming requests.
_keep_alive_conns = (manager.get_conn_send(), manager.get_conn_receive())
start_new_thread(keep_alive, _keep_alive_conns)

print("Extractor listening...")
# Consume the "requests" queue forever, handing every message to run().
# Ctrl-C closes the connections and terminates the process immediately.
while True:
	try:
		manager.receive_from_queue("requests", run)
	except KeyboardInterrupt:
		manager.close_connections()
		os._exit(0)