# Slexy.org is shutting down and stopped accepting new Pastes on May 4th, 2021.
# Existing Pastes will stop being available on or after May 10th, 2021.
# Author: Not specified Language: python
# Description: Not specified Timestamp: 2017-09-02 00:54:30 +0000
# View raw paste Reply
from db import DB
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

import pickle
import matplotlib.pyplot as plt
from datetime import date, timedelta


class Predictor:
    """
    Waiting-time predictor for a ticket queue stored in postgres.

    A degree-2 polynomial regression estimates the number of escapes
    ("fugas") from the number of people in line; its output is fed as the
    extra feature 'fuga_pred' to a Random Forest that predicts the delay
    between ticket_time and call_time, in seconds.
    """

    # Feature columns, in the order the Random Forest is trained with;
    # 'fuga_pred' is always appended as the last column.
    __BASE_FEATURES = ('ticket_time', 'count', 'tiempo_atencion', 'day',
                       'month', 'weekday', 'modulos_abiertos')

    # Rows whose 'count' is not above this value get fugas_antes forced to 0.
    __FUGAS_MIN_COUNT = 7

    def __init__(self, db, regname=None, polyname=None, slug=''):
        """
        Parameters
        ----------
        db: dict
            Connection settings with the keys 'name', 'user' and 'pass'.
        regname: str
            Path of a pickled, fitted regressor to load. If None, a new
            RandomForestRegressor is created.
        polyname: str
            Path of a pickled, fitted degree-2 polynomial regression to
            load. If None, a new pipeline is created.
        slug: str
            Value matched against the `location` column when querying.

        Notes
        -----
        pickle can execute arbitrary code while loading: only pass files
        that come from a trusted source.
        """

        self.__db = DB(dbname=db['name'], user=db['user'], passwd=db['pass'])
        self.slug = slug

        # Silences pandas' chained-assignment warning for the whole process.
        pd.options.mode.chained_assignment = None  # default='warn'

        # Only set by get_escape_rate(); predict() falls back to __poly.
        self.__escape_rate = None

        if regname is not None:
            self.__regressor = self.__load_pickle(regname)
        else:
            # NOTE(review): recent scikit-learn releases spell this
            # criterion 'absolute_error'; adjust when upgrading.
            self.__regressor = RandomForestRegressor(n_estimators=50,
                                                     n_jobs=-1,
                                                     criterion='mae')

        if polyname is not None:
            self.__poly = self.__load_pickle(polyname)
        else:
            self.__poly = self.__new_poly()

    @staticmethod
    def __load_pickle(path):
        """Loads a pickled object, closing the file afterwards."""
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    @staticmethod
    def __new_poly():
        """Builds an unfitted degree-2 polynomial regression pipeline."""
        return Pipeline([('poly',
                          PolynomialFeatures(degree=2)),
                         ('linear',
                          LinearRegression(fit_intercept=False))])

    def get_escape_rate(self, count, fugas_antes):
        """
        Fits a fresh degree-2 polynomial regression of escapes on the number
        of people in line and stores it for later use by predict().

        Parameters
        ----------
        count: pandas.Series
            Number of people in line.
        fugas_antes: array
            Number of escapes before.

        Returns
        -------
        fit: sklearn.pipeline.Pipeline
            Fitted 2-degree polynomial regression for predicting escapes by
            number of people in line.

        """

        model = self.__new_poly()
        self.__escape_rate = model.fit(count.values.reshape(-1, 1),
                                       fugas_antes)
        return self.__escape_rate

    def query(self, q):
        """
        Executes a SQL query and returns the result as a Pandas Data Frame.

        Parameters
        ----------
        q: str
            String containing the SQL query

        Returns
        -------
            pandas.DataFrame
        """

        return pd.DataFrame.from_dict(self.__db.query(q).dictresult())

    def __get_n_modulos(self, gen):
        """
        Get the approximation of open modules by ticket_time, in one
        generation.

        A module counts as busy for a ticket when it has a call_time within
        the 15 minutes before that ticket's ticket_time; the query reports
        5 minus the number of busy modules.

        Parameters
        ----------
            gen: int
                Number of the generation

        Returns
        -------
            pandas.DataFrame
                open modules x ticket_time
        """

        # int() guarantees that only a number is interpolated into the SQL.
        query = ("WITH mods_busy AS ( "
                 "SELECT DISTINCT q1.modulo, q2.generation, q2.number, "
                 "q2.ticket_time "
                 "FROM queue2 q1, queue2 q2 "
                 "WHERE q1.call_time "
                 "BETWEEN (q2.ticket_time - '15 minutes'::interval) "
                 "AND q2.ticket_time "
                 "AND q2.generation = %s "
                 "GROUP BY q2.generation, q2.number, q1.modulo, "
                 "q2.ticket_time) "
                 "SELECT 5 - count(*) AS mods_open, ticket_time "
                 "FROM mods_busy "
                 "GROUP BY generation, number, ticket_time "
                 "ORDER BY ticket_time") % int(gen)

        df = self.query(query)
        df['ticket_time'] = df['ticket_time'].apply(lambda x: x.timestamp())

        return df

    def __polyfit(self, count, fugas):
        """Fits the polynomial escape model on a Series of line counts."""
        count = count.values.reshape(-1, 1)
        self.__poly.fit(count, fugas)

    def ___polypredict(self, count):
        """Predicts escapes for a Series of line counts."""
        count = count.values.reshape(-1, 1)
        return self.__poly.predict(count)

    def __get_data(self, date):
        """
        Queries one day of 'Atención General' tickets for self.slug and
        derives the numeric columns used as features and target.

        Parameters
        ----------
            date: datetime.date
                Day to fetch (matched against ticket_time::date).

        Returns
        -------
            pandas.DataFrame
                Query columns plus 'day', 'weekday' and 'month'; 'demora'
                and 'tiempo_atencion' in seconds, 'ticket_time' as a POSIX
                timestamp.

        Raises
        ------
            KeyError
                When the result has no 'demora' column; presumably this is
                what an empty result set produces, and callers in this file
                rely on it to skip days without data.
        """

        # Double any single quote so the slug stays one SQL string literal.
        slug = str(self.slug).replace("'", "''")

        q = ("SELECT (call_time - ticket_time) as demora, count, "
             "tiempo_atencion, "
             "ticket_time, fugas_antes, generation, modulos_abiertos "
             "FROM queue2 "
             "WHERE ticket_time::date = '%s' "
             "AND (call_time - ticket_time) < '90 minutes' "
             "AND line = 'Atención General' "
             "AND location = '%s' "
             "ORDER BY ticket_time, count") % \
            (date.strftime('%Y-%m-%d'), slug)

        df = self.query(q)

        df['demora'] = df['demora'].apply(lambda x: x.seconds)
        df['day'] = df['ticket_time'].apply(lambda x: x.day)
        df['weekday'] = df['ticket_time'].apply(lambda x: x.weekday())
        df['month'] = df['ticket_time'].apply(lambda x: x.month)
        df['ticket_time'] = df['ticket_time'].apply(lambda x: x.timestamp())
        # NOTE(review): the hour component is ignored here and in predict().
        df['tiempo_atencion'] = df['tiempo_atencion'].\
            apply(lambda x: x.second + 60 * x.minute)
        df['modulos_abiertos'] = df['modulos_abiertos'].fillna(value=0)
        # Keep the escape count only for lines longer than the threshold.
        df['fugas_antes'] = df['fugas_antes'].where(
            df['count'] > self.__FUGAS_MIN_COUNT, 0)

        return df

    def __feature_matrix(self, df):
        """
        Returns a copy of the feature columns of df with the polynomial
        escape prediction appended as 'fuga_pred'.
        """
        X = df[list(self.__BASE_FEATURES)].copy()
        X['fuga_pred'] = self.___polypredict(df['count'])
        return X

    def fit(self, date=None, df=None):
        """
        Trains the polynomial escape model and the Random Forest Regressor.

        Parameters
        ----------
            date: datetime.date optional
                The date to which to get the data from. Unused if df is given.
            df: pandas.DataFrame
                Data from which fit the regressor. If None, data is queried
                with the date.

        Raises
        ------
            ValueError
                Unless exactly one of date and df is given.
        """

        if (date is None) == (df is None):
            raise ValueError("fit() needs exactly one of 'date' or 'df'")

        if df is None:
            df = self.__get_data(date)

        self.__polyfit(df['count'], df['fugas_antes'])
        X = self.__feature_matrix(df)

        fugas_reales = np.asarray(df['fugas_antes'], dtype='float')
        print("error polyfit: %s" %
              self.___error(X['fuga_pred'].values, fugas_reales))

        y = np.asarray(df['demora'], dtype='float')
        self.__regressor.fit(X, y)

        print("Training Complete!")

    def incremental_fit(self, dates):
        """
        Trains the regressor with data from a series of dates.

        Dates for which loading the data raises KeyError are skipped.

        Parameters
        ----------
            dates: [datetime.date]
                The series of dates to train the regressor with.

        Raises
        ------
            ValueError
                When none of the dates produced any data.
        """

        frames = []
        for d in dates:
            try:
                frames.append(self.__get_data(d))
            except KeyError:
                continue

        if not frames:
            raise ValueError("no data found for any of the given dates")

        self.fit(df=pd.concat(objs=frames))

    def predict(self, ticket_time, count, tiempo_atencion_prom,
                modulos_abiertos):
        """
        Predicts the queue time of a ticket given the time, number of people in
        line, average attention time and number of open modules.

        The escape estimate comes from the model fitted by
        get_escape_rate() when there is one, otherwise from the polynomial
        model that fit() trains.

        Parameters
        ----------
        ticket_time: datetime.datetime
            Time and date of the ticket.
        count: int
            Number of people in line
        tiempo_atencion_prom: datetime.time
            Average attention time.
        modulos_abiertos: int
            Number of open modules

        Returns
        -------
            Output of the regressor's predict() for the built feature row(s).
        """

        # The regression expects a 2D (n_samples, 1) input.
        counts = np.asarray(count).reshape(-1)
        model = self.__escape_rate
        if model is None:
            model = self.__poly
        fuga = np.ravel(model.predict(counts.reshape(-1, 1)))

        tap = tiempo_atencion_prom.second + tiempo_atencion_prom.minute * 60

        columns = list(self.__BASE_FEATURES) + ['fuga_pred']
        X = pd.DataFrame({
            'ticket_time': ticket_time.timestamp(),
            'day': ticket_time.day,
            'weekday': ticket_time.weekday(),
            'month': ticket_time.month,
            'count': counts,
            'tiempo_atencion': tap,
            'modulos_abiertos': modulos_abiertos,
            'fuga_pred': fuga
        }, columns=columns)

        return self.__regressor.predict(X)

    @staticmethod
    def ___error(pred_y, y):
        """
        Mean absolute error between two equally shaped sequences.

        Raises
        ------
            ValueError
                When the shapes differ or the inputs are empty.
        """
        pred_y = np.asarray(pred_y, dtype='float')
        y = np.asarray(y, dtype='float')
        if pred_y.shape != y.shape:
            raise ValueError("pred_y and y must have the same shape")
        if y.size == 0:
            raise ValueError("cannot compute the error of empty inputs")
        return np.mean(np.abs(pred_y - y))

    @staticmethod
    def __promedio(X):
        """
        Computes (count - fuga_pred) * tiempo_atencion / modulos_abiertos
        from the columns of X.
        """
        modulos = X['modulos_abiertos']
        tiempo = (X['count'] - X['fuga_pred']) * X['tiempo_atencion'] / modulos
        return tiempo

    def __evaluate(self, df):
        """
        Predicts the delay for every row of df, plots the real delays as
        blue dots and the predicted ones as red dots against ticket_time,
        and returns (pred, y, error) where error is the mean absolute error.
        """
        X = self.__feature_matrix(df)
        y = np.asarray(df['demora'], dtype='float')
        pred = self.__regressor.predict(X)
        error = self.___error(pred, y)

        plt.plot(df['ticket_time'], y, 'bo')
        plt.plot(df['ticket_time'], pred, 'ro')
        plt.legend()
        plt.xlabel('time [s]')
        plt.ylabel('wait [s]')

        return pred, y, error

    def test(self, date):
        """
        Compares the prediction of the trained regressor with a given date.

        Parameters
        ----------
            date: datetime.date

        Returns
        -------
            (pred, error)
                Predicted delays and their mean absolute error.
        """

        pred, _, error = self.__evaluate(self.__get_data(date))
        return pred, error

    def test_modules(self, date):
        """
        Same comparison as test(), additionally printing the mean real and
        the mean predicted delay.

        Parameters
        ----------
            date: datetime.date

        Returns
        -------
            (pred, error)
                Predicted delays and their mean absolute error.
        """

        # NOTE(review): an earlier revision built a constant column of 5
        # modules here but never used it; the stored modulos_abiertos
        # values are what gets evaluated.
        pred, y, error = self.__evaluate(self.__get_data(date))

        print(y.mean())
        print(pred.mean())

        return pred, error

    def save(self, regname, polyname):
        """
        Saves the regressors to files.

        Parameters
        ----------
            regname: str
                Name of the file of Random Forest Regressor.
            polyname: str
                Name of the file of the Polynomial Regressor.
        """

        with open(regname, 'wb') as fh:
            pickle.dump(self.__regressor, fh)
        with open(polyname, 'wb') as fh:
            pickle.dump(self.__poly, fh)
        print("Save complete!")

# Connection settings handed to Predictor, which reads the keys 'name',
# 'user' and 'pass'.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from configuration kept outside the repository.
__my_db = {'name': 'zero', 'user': 'pablore', 'pass': 'root'}
# Value Predictor matches against the `location` column when querying.
__my_slug = 'Servipag Galería Santiago Centro'

def mainfit(start=date(2016, 9, 1), end=date(2016, 9, 30),
            regname='reg', polyname='poly'):
    """
    Trains a new Predictor on a range of dates and saves its models.

    Parameters
    ----------
        start: datetime.date
            First day to train with.
        end: datetime.date
            Day after the last one to train with (daterange excludes it).
        regname: str
            File the Random Forest Regressor is saved to.
        polyname: str
            File the Polynomial Regressor is saved to.
    """
    pr = Predictor(db=__my_db, slug=__my_slug)
    pr.incremental_fit(daterange(start, end))
    pr.save(regname, polyname)


def maintest(start=date(2016, 10, 1), end=date(2016, 10, 30),
             regname='reg', polyname='poly'):
    """
    Loads the saved models, tests them on every day of a date range, prints
    the average of the daily errors and shows the accumulated plot.

    Days for which Predictor.test raises KeyError are skipped and do not
    count towards the average.

    Parameters
    ----------
        start: datetime.date
            First day to test.
        end: datetime.date
            Day after the last one to test (daterange excludes it).
        regname: str
            File the Random Forest Regressor is loaded from.
        polyname: str
            File the Polynomial Regressor is loaded from.
    """
    pr = Predictor(regname=regname, polyname=polyname,
                   db=__my_db, slug=__my_slug)

    errors = []
    for d in daterange(start, end):
        try:
            _, error = pr.test(d)
        except KeyError:
            continue
        errors.append(error)

    if errors:
        # Average only over the days that were actually evaluated.
        print("error promedio: %s" % (sum(errors) / len(errors)))
    else:
        print("error promedio: no data")
    plt.show()


def daterange(start_date, end_date):
    """
    Yields consecutive days starting at start_date and stopping before
    end_date (end_date itself is not produced).

    Nothing is yielded when end_date is not after start_date.
    """
    total_days = int((end_date - start_date).days)
    offset = 0
    while offset < total_days:
        yield start_date + timedelta(offset)
        offset += 1

if __name__ == '__main__':
    # Script entry point: evaluates the models previously saved as 'reg'
    # and 'poly'. Uncomment mainfit() to train and save them first.
    #mainfit()
    maintest()
    # Alternative manual check of a single day, kept for reference:
    #pr = Predictor(regname='reg', polyname='poly')
    #pr.test_modules(date(2016, 9, 7))
    #plt.show()
 
# View raw paste Reply