#!/usr/bin/env python
"""
_cmst0_long_jobs_

Checks the jobs currently running in the system, evaluates
whether they have been running for longer than expected, and alerts
on those which have exceeded the configured threshold.

Availability metrics are defined as:

0 - At least one Express or Repack job is running longer than expected
50 - At least one PromptReco job is running longer than expected
75 - A LogCollect/Cleanup job is running longer than expected
100 - All jobs are running on time
"""

import logging
import threading
import sys
import re
import os
import time

from WMCore.WMInit import connectToDB
from WMCore.Database.DBFormatter import DBFormatter
from WMCore.Configuration import loadConfigurationFile

class JobInfoDAO(DBFormatter):
    """
    _JobInfoDAO_

    DAO that lists the WMBS jobs currently in the 'executing' state
    together with their matching BossAir record: job id, job type,
    status timestamp, grid id, scheduler status, location plugin and
    workflow name. Only jobs whose status translates to "Running" or
    "Pending" are reported.
    """
    sql = """SELECT wmbs_job.id AS job_id, wmbs_sub_types.name AS job_type,
                    bl_runjob.status_time AS timestamp, bl_runjob.grid_id,
                    bl_status.name AS status, wmbs_location.plugin,
                    wmbs_workflow.name AS workflow
             FROM wmbs_job
             INNER JOIN bl_runjob ON
             bl_runjob.wmbs_id = wmbs_job.id AND
             bl_runjob.retry_count = wmbs_job.retry_count
             INNER JOIN wmbs_jobgroup ON
             wmbs_jobgroup.id = wmbs_job.jobgroup
             INNER JOIN wmbs_subscription ON
             wmbs_subscription.id = wmbs_jobgroup.subscription
             INNER JOIN wmbs_sub_types ON
             wmbs_subscription.subtype = wmbs_sub_types.id
             INNER JOIN bl_status ON
             bl_status.id = bl_runjob.sched_status
             INNER JOIN wmbs_location ON
             wmbs_job.location = wmbs_location.id
             INNER JOIN wmbs_workflow oN
             wmbs_workflow.id = wmbs_subscription.workflow
             WHERE wmbs_job.state = (SELECT id FROM wmbs_job_state
                                     WHERE wmbs_job_state.name = 'executing')
             """

    def execute(self, conn = None, transaction = False):
        """
        _execute_

        Run the query and return a list with one dictionary per job whose
        translated status is "Running" or "Pending".

        For every row the BossAir plugin named in the "plugin" column is
        imported and its stateMap() is used to translate the raw "status"
        value; the translated value replaces the raw one in the row.
        A missing (None) timestamp is replaced by the current time.
        """
        rawRows = self.dbi.processData(self.sql, conn = conn,
                                       transaction = transaction)
        wantedStates = ("Running", "Pending")
        selected = []
        for row in self.formatDict(rawRows):
            pluginName = row["plugin"]
            # The plugin class is expected to carry the same name as its module
            pluginModule = __import__("WMCore.BossAir.Plugins.%s" % pluginName,
                                      globals(), locals(), [pluginName])
            pluginClass = getattr(pluginModule, pluginName)
            row["status"] = pluginClass.stateMap().get(row["status"])
            if row["timestamp"] is None:
                row["timestamp"] = time.time()
            if row["status"] in wantedStates:
                selected.append(row)

        return selected

# Pattern for names of the form <letters>_Run<six digits>_<suffix>;
# the second capture group holds the six-digit run number.
runNumberRegex = re.compile(r"([A-Za-z]+)_Run([0-9]{6})_([A-Za-z0-9_]+)")

def setup():
    """
    _setup_

    Perform the global setup: open the database connection and load
    the alarm configuration file etc/SLSAlarmsConfig.py, looked up under
    $SLS_CONFIG when that is set and non-empty, otherwise under $T0_ROOT.

    Returns the "cmst0_long_jobs" section of that file, with the
    file's global "Settings" section attached to it as config.Settings.
    """
    connectToDB()

    baseDir = os.environ.get("SLS_CONFIG") or os.environ["T0_ROOT"]
    fullConfig = loadConfigurationFile(os.path.join(baseDir,
                                                    "etc/SLSAlarmsConfig.py"))

    # Keep only this alarm's section, but carry the shared Settings along
    alarmConfig = fullConfig.cmst0_long_jobs
    sharedSettings = fullConfig.Settings
    alarmConfig.section_("Settings")
    alarmConfig.Settings = sharedSettings
    return alarmConfig

def retrieveLongJobs(config):
    """
    _retrieveLongJobs_

    Retrieve information about executing jobs through JobInfoDAO and
    return those that have been in their current status for longer than
    the threshold (in hours) configured for that status and job type.

    Jobs are skipped when their workflow name does not match
    runNumberRegex, when their run number is listed in the optional
    config.RunBlacklist, or when no threshold is configured for their
    job type under config.Thresholds.<status>.

    Returns a dictionary of the form
        {workflowName: {jobType: [{"status": ..., "elapsedTime": hours,
                                   "jobId": ...}, ...]}}
    """
    myThread = threading.current_thread()
    retrieveInfo = JobInfoDAO(logger = logging, dbinterface = myThread.dbi)
    jobInfo = retrieveInfo.execute()

    # The regex yields the run number as a string, so normalise the
    # blacklist to strings; this lets both 123456 and "123456" match.
    blacklistedRuns = set(str(run) for run in
                          (getattr(config, "RunBlacklist", None) or []))
    now = time.time()

    longJobs = {}
    for job in jobInfo:
        workflowName = job["workflow"]
        jobType = job["job_type"]
        status = job["status"]

        try:
            match = runNumberRegex.match(workflowName)
        except TypeError:
            # Workflow name is not a string (e.g. None), nothing to match
            match = None
        if match is None:
            # Name carries no run number, skip
            continue
        if match.group(2) in blacklistedRuns:
            # Ignore blacklisted runs
            continue

        # Job passes the run list, let's check it against its threshold
        elapsedTime = int(now - float(job["timestamp"]))
        thresholdsForStatus = getattr(config.Thresholds, status)
        if not hasattr(thresholdsForStatus, jobType):
            continue
        threshold = getattr(thresholdsForStatus, jobType)
        if elapsedTime > 3600 * threshold:
            jobsOfType = longJobs.setdefault(workflowName, {}).setdefault(jobType, [])
            jobsOfType.append({"status" : status,
                               "elapsedTime" : elapsedTime / 3600.0,
                               "jobId" : job["job_id"]})

    return longJobs

def processLongJobs(longJobs, htmlPath = '/afs/cern.ch/user/c/cmst1/www/LongJobs/LongJobs.html'):
    """
    _processLongJobs_

    Write detailed information about the long jobs in a simple
    HTML page.

    longJobs is a dictionary {workflow: {jobType: [jobEntry, ...]}} where
    every jobEntry has the keys "jobId", "elapsedTime" (hours) and
    "status". For each workflow/job type at most five jobs are listed,
    those with the largest elapsed time first.

    htmlPath is the file to (over)write; it defaults to the published
    LongJobs page.
    """
    separator = '---------------------------------------------------------------------------------------\n'
    timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

    # Build the whole page first so that a formatting error cannot leave
    # a truncated file behind.
    lines = ['<html>\n',
             '<pre>\n',
             '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>\n',
             separator,
             'Report generated on %s UTC\n' % timestamp,
             separator,
             '\n\n']
    for workflow in longJobs:
        lines.extend([separator, 'Workflow: %s\n' % workflow, separator])
        for jobType in longJobs[workflow]:
            jobsOfType = longJobs[workflow][jobType]
            lines.extend([separator, 'Job type: %s\n' % jobType, separator])
            if len(jobsOfType) > 5:
                lines.append('More than 5 jobs for this workflow/job type, displaying the oldest five.\n')
            # Oldest means largest elapsed time, hence the descending sort
            oldestJobs = sorted(jobsOfType, key = lambda x : x['elapsedTime'],
                                reverse = True)[:5]
            for job in oldestJobs:
                lines.append('Job WMBS ID: %12s\tElapsed Time: %2.1f hours\tStatus: %s\n' % (job['jobId'], job['elapsedTime'],
                                                                                             job['status']))
        lines.append('\n\n')
    lines.extend(['</pre>\n', '</html>\n'])

    # The context manager closes the file even if a write fails
    with open(htmlPath, 'w') as htmlFile:
        htmlFile.writelines(lines)

def calculateAvailability(longJobs):
    """
    _calculateAvailability_

    Translate the long jobs dictionary ({workflow: {jobType: ...}})
    into an availability figure:

    0   - a Repack/Express/Merge/Processing/Harvesting job belongs to a
          workflow whose name contains "Repack" or "Express"
    50  - such a job belongs to a workflow whose name contains "PromptReco"
    75  - only LogCollect/Cleanup jobs lowered the figure
    100 - nothing of the above was found
    """
    if not longJobs:
        return 100

    mainJobTypes = ["Repack", "Express", "Merge", "Processing", "Harvesting"]
    housekeepingTypes = ["LogCollect", "Cleanup"]

    score = 100
    for workflowName in longJobs:
        for jobType in longJobs[workflowName]:
            if jobType in mainJobTypes:
                # Worst case, no need to look any further
                if "Repack" in workflowName or "Express" in workflowName:
                    return 0
                if "PromptReco" in workflowName:
                    score = 50
            elif jobType in housekeepingTypes and score == 100:
                # Never let a housekeeping job improve a worse figure
                score = 75
    return score

def _localTimestampWithOffset():
    """
    Return the current local time as YYYY-MM-DDTHH:MM:SS+HH:MM (or -HH:MM),
    using the UTC offset that applies right now, DST included.
    """
    localNow = time.localtime()
    # time.timezone/altzone are seconds WEST of UTC, hence the sign flip
    if time.daylight and localNow.tm_isdst > 0:
        utcOffset = -time.altzone
    else:
        utcOffset = -time.timezone
    sign = "+" if utcOffset >= 0 else "-"
    hours, minutes = divmod(abs(utcOffset) // 60, 60)
    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", localNow)
    return timestamp + "%s%02d:%02d" % (sign, hours, minutes)

def buildSLSXML(config, availability):
    """
    _buildSLSXML_

    Build the SLS service update XML for the given availability and
    write it to <config.Settings.xmlDir>/<config.xmlFile>
    (xmlFile defaults to "cmst0_long_jobs.xml").

    If config has an "Intervention" section (startTime in
    %Y-%m-%dT%H:%M:%S format, duration in hours, message) and the
    intervention has not finished yet, it is included in the XML.

    Failures while writing the file are reported on stdout/stderr and
    do not raise. Returns None.
    """
    # Imported here because the module header does not import traceback
    import traceback

    timestamp = _localTimestampWithOffset()

    # Retrieve the intervention info if any
    interventionInfo = {}
    if hasattr(config, "Intervention"):
        startTime = config.Intervention.startTime
        duration = config.Intervention.duration
        message = config.Intervention.message

        # Check that the intervention is present or in the future
        structStartTime = time.strptime(startTime, "%Y-%m-%dT%H:%M:%S")
        startTimeSeconds = time.mktime(structStartTime)
        if (startTimeSeconds + duration * 3600) >= time.time():
            interventionInfo = {'startTime' : startTime,
                                'duration' : duration,
                                'message' : message}

    intervention = ""
    if interventionInfo:
        interventionTemplate = """        <interventions>
            <intervention start="{startTime}" length="PT{duration}H">
                {message}
            </intervention>
        </interventions>"""

        intervention = interventionTemplate.format(**interventionInfo)

    textLines = "<textvalue>Jobs are running on schedule</textvalue>"
    if availability < 100:
        textLines = "<textvalue>There are long running jobs, check http://cmst1.web.cern.ch/CMST1/LongJobs/LongJobs.html for details.</textvalue>"

    template = """<?xml version="1.0" encoding="utf-8"?>
    <serviceupdate>
        <id>CMST0-long-jobs</id>
        <availability>{availability}</availability>
        <timestamp>{timestamp}</timestamp>
        <data>
            {data}
        </data>
{intervention}
    </serviceupdate>\n"""

    xml = template.format(data = textLines, availability = availability,
                          timestamp = timestamp, intervention = intervention)

    # Get the output file path
    xmlFile = getattr(config, "xmlFile", "cmst0_long_jobs.xml")
    try:
        # The context manager closes the file only if it was actually opened
        with open(os.path.join(config.Settings.xmlDir, xmlFile), 'w') as outputFile:
            outputFile.write(xml)
    except Exception:
        # Best effort: report the problem but let the caller carry on
        print("Couldn't write the XML file")
        traceback.print_exc()

    return

def main():
    """
    _main_

    Script's main function: load the configuration, collect the long
    running jobs, compute the availability and write both the SLS XML
    file and the HTML report.

    Returns 0 on success. On failure a local timestamp is written to
    stderr and the original exception is re-raised.
    """
    try:
        # Check if the wmagent config file path exists in the environment
        if "config" in os.environ:
            os.environ['WMAGENT_CONFIG'] = os.path.join(os.environ["config"], 'config.py')

        config = setup()
        longJobsInfo = retrieveLongJobs(config)
        availability = calculateAvailability(longJobsInfo)
        buildSLSXML(config, availability)
        processLongJobs(longJobsInfo)
        return 0

    except Exception:
        # Stamp the error output with the local time and its real UTC
        # offset (time.timezone/altzone are seconds WEST of UTC).
        localNow = time.localtime()
        if time.daylight and localNow.tm_isdst > 0:
            utcOffset = -time.altzone
        else:
            utcOffset = -time.timezone
        sign = "+" if utcOffset >= 0 else "-"
        hours, minutes = divmod(abs(utcOffset) // 60, 60)
        timestamp = time.strftime("%Y-%m-%dT%H:%M:%S", localNow)
        timestamp += "%s%02d:%02d" % (sign, hours, minutes)
        sys.stderr.write('\n' + timestamp + '\n')
        # Bare raise keeps the original traceback intact
        raise

# When run as a script, execute main() and use its return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
