In [ ]:
!pip install --upgrade pip
Cache entry deserialization failed, entry ignored
Collecting pip
  Using cached https://files.pythonhosted.org/packages/5f/25/e52d3f31441505a5f3af41213346e5b6c221c9e086a166f3703d2ddaf940/pip-18.0-py2.py3-none-any.whl
Installing collected packages: pip
  Found existing installation: pip 9.0.3
    Uninstalling pip-9.0.3:
In [3]:
import pandas as pd
import s3fs
import scipy
import matplotlib as plot
import requests as rs
import numpy as np
import zipfile
In [4]:
fs = s3fs.S3FileSystem(anon=True)
In [5]:
fs.ls('datacases/datathon-2018-2/')
Out[5]:
['datacases/datathon-2018-2/kaufland',
 'datacases/datathon-2018-2/nsi',
 'datacases/datathon-2018-2/ontotext',
 'datacases/datathon-2018-2/telelink',
 'datacases/datathon-2018-2/telenor']
In [6]:
fs.ls('datacases/datathon-2018-2/telenor')
Out[6]:
['datacases/datathon-2018-2/telenor/data.zip']
In [14]:
#URL="https://storage.googleapis.com/global-datathon-2018/telenor/data.zip"

#print("downloading with requests")
#r = rs.get(URL)
#with open("code.zip", "wb") as code:
#    code.write(r.content)    
#print("done downloading")
downloading with requests
done downloading

Loading the Telenor Dataset

In [7]:
zf = zipfile.ZipFile('code.zip')

Telenor = pd.read_csv(zf.open('data.csv'),encoding="latin",delimiter=";")
In [16]:
Telenor.head(10)
Out[16]:
DATETIME RAVEN_NAME FAMILY_NAME MEMBER_NAME NETWORK FIRST_GET_RESPONSE_SUCCESS_D PAGE_BROWSING_DELAY TCP_SETUP_TOTAL_DELAY PAGE_CONTENT_DOWNLOAD_TOTAL_D FIRST_DNS_RESPONSE_SUCCESS_D DNS_RESPONSE_SUCCESS_DELAY FIRST_TCP_RESPONSE_SUCCESS_D PAGE_SR_DELAYS SYN_SYN_DELAY TCP_CONNECT_DELAY PAGE_BROWSING_DELAYS
0 2018-07-06 10:15 Razzle Dazzle Rose raven Shadow Targerian Maester Aemon 2g 644 2832 2100 730 2 2 2100 2746 4 2100 0
1 2018-07-06 19:30 Tiny Insolent raven Lulu Targerian Maester Aemon 4g 258 51752 260 51334 0 0 260 676 8 260 0
2 2018-07-06 18:30 Blithe And Beautiful raven Mo Targerian Rheagar 3g 1094 19562 470 17130 4 4 470 3526 6 470 0
3 2018-07-06 00:45 Illuminating Emerald raven Daffy Lannister Kevan 2g 432 1488 558 638 272 272 558 1282 430 558 0
4 2018-07-06 12:00 Blithe And Beautiful raven Mo Greyjoy Theon 4g 1094 25132 576 13054 4 4 576 13172 462 576 0
5 2018-07-06 16:15 Blithe And Beautiful raven Mo Greyjoy Aeron 2g 44856 51494 436 50576 2 2 436 45774 14 436 0
6 2018-07-06 03:45 Goofy raven Cleo Baelish Petyr Baelish 4g 434 3018 458 2178 366 366 458 1274 366 458 0
7 2018-07-06 04:45 Shocking Pink raven Wilbur Lannister Cersei 2g 17138 24412 1818 21714 382 382 1818 19836 1288 1818 0
8 2018-07-06 10:30 Shocking Pink raven Wilbur Targerian Maester Aemon 2g 0 1760 448 1290 2 2 448 470 384 448 0
9 2018-07-06 12:15 Shocking Pink raven Wilbur Greyjoy Balon 2g 1286 3834 1692 1628 2 2 1692 3492 16 1692 0
In [19]:
Telenor.shape
Out[19]:
(30091754, 16)
In [20]:
Telenor.dtypes
Out[20]:
DATETIME                         object
RAVEN_NAME                       object
FAMILY_NAME                      object
MEMBER_NAME                      object
NETWORK                          object
FIRST_GET_RESPONSE_SUCCESS_D      int64
PAGE_BROWSING_DELAY               int64
TCP_SETUP_TOTAL_DELAY             int64
PAGE_CONTENT_DOWNLOAD_TOTAL_D     int64
FIRST_DNS_RESPONSE_SUCCESS_D      int64
DNS_RESPONSE_SUCCESS_DELAY        int64
FIRST_TCP_RESPONSE_SUCCESS_D      int64
PAGE_SR_DELAYS                    int64
SYN_SYN_DELAY                     int64
TCP_CONNECT_DELAY                 int64
PAGE_BROWSING_DELAYS              int64
dtype: object

The rows having zero delay in all the delay columns are considered to be the success

Subsetting the data to get all the success entries(Telenor1)

In [8]:
Telenor1 = Telenor[(Telenor.FIRST_GET_RESPONSE_SUCCESS_D==0) & (Telenor.PAGE_BROWSING_DELAY==0) & (Telenor.TCP_SETUP_TOTAL_DELAY==0) &
                   (Telenor.PAGE_CONTENT_DOWNLOAD_TOTAL_D==0) & (Telenor.FIRST_DNS_RESPONSE_SUCCESS_D ==0) &
                   (Telenor.DNS_RESPONSE_SUCCESS_DELAY==0) & (Telenor.FIRST_TCP_RESPONSE_SUCCESS_D==0) &
                   (Telenor.PAGE_SR_DELAYS==0) & (Telenor.SYN_SYN_DELAY==0) & (Telenor.TCP_CONNECT_DELAY==0) & (Telenor.PAGE_BROWSING_DELAYS ==0)]
Telenor1.head(10)
Out[8]:
DATETIME RAVEN_NAME FAMILY_NAME MEMBER_NAME NETWORK FIRST_GET_RESPONSE_SUCCESS_D PAGE_BROWSING_DELAY TCP_SETUP_TOTAL_DELAY PAGE_CONTENT_DOWNLOAD_TOTAL_D FIRST_DNS_RESPONSE_SUCCESS_D DNS_RESPONSE_SUCCESS_DELAY FIRST_TCP_RESPONSE_SUCCESS_D PAGE_SR_DELAYS SYN_SYN_DELAY TCP_CONNECT_DELAY PAGE_BROWSING_DELAYS
2914 2018-07-06 00:00 Adventurous raven Lola Stark Eddard 2g 0 0 0 0 0 0 0 0 0 0 0
6252 2018-07-07 10:45 Magical Metal raven Bibi Stark Benjen 2g 0 0 0 0 0 0 0 0 0 0 0
19409 2018-07-11 23:30 Tarnished Gold raven Bo Lannister Tyrion 2g 0 0 0 0 0 0 0 0 0 0 0
22808 2018-07-12 08:45 Small Dark-Gray raven Azul Stark Maester Luwin 2g 0 0 0 0 0 0 0 0 0 0 0
24035 2018-07-12 17:45 Green Sheen raven Azul Lannister Jamie 4g 0 0 0 0 0 0 0 0 0 0 0
33452 2018-07-28 23:00 Less Combative raven Zazu Targerian Deanerys 2g 0 0 0 0 0 0 0 0 0 0 0
40896 2018-07-30 09:30 Blue raven Phoenix Baelish Petyr Baelish 2g 0 0 0 0 0 0 0 0 0 0 0
43656 2018-07-31 19:30 Continuous Little raven Paco Stark Eddard 2g 0 0 0 0 0 0 0 0 0 0 0
44448 2018-07-31 12:45 Enough Fried raven Tweety Stark Robb 2g 0 0 0 0 0 0 0 0 0 0 0
50891 2018-08-02 08:00 Bronze raven Phoenix Targerian Maester Aemon 2g 0 0 0 0 0 0 0 0 0 0 0

Subsetting the data to get all the failure entries(Telenor2)

In [9]:
Telenor2 = Telenor[(Telenor.FIRST_GET_RESPONSE_SUCCESS_D!=0) | (Telenor.PAGE_BROWSING_DELAY!=0) | (Telenor.TCP_SETUP_TOTAL_DELAY!=0) |
                   (Telenor.PAGE_CONTENT_DOWNLOAD_TOTAL_D!=0) | (Telenor.FIRST_DNS_RESPONSE_SUCCESS_D !=0) |
                   (Telenor.DNS_RESPONSE_SUCCESS_DELAY!=0) | (Telenor.FIRST_TCP_RESPONSE_SUCCESS_D!=0) |
                   (Telenor.PAGE_SR_DELAYS!=0) | (Telenor.SYN_SYN_DELAY!=0) | (Telenor.TCP_CONNECT_DELAY!=0) | (Telenor.PAGE_BROWSING_DELAYS!=0)]
Telenor2.head(10)
Out[9]:
DATETIME RAVEN_NAME FAMILY_NAME MEMBER_NAME NETWORK FIRST_GET_RESPONSE_SUCCESS_D PAGE_BROWSING_DELAY TCP_SETUP_TOTAL_DELAY PAGE_CONTENT_DOWNLOAD_TOTAL_D FIRST_DNS_RESPONSE_SUCCESS_D DNS_RESPONSE_SUCCESS_DELAY FIRST_TCP_RESPONSE_SUCCESS_D PAGE_SR_DELAYS SYN_SYN_DELAY TCP_CONNECT_DELAY PAGE_BROWSING_DELAYS
0 2018-07-06 10:15 Razzle Dazzle Rose raven Shadow Targerian Maester Aemon 2g 644 2832 2100 730 2 2 2100 2746 4 2100 0
1 2018-07-06 19:30 Tiny Insolent raven Lulu Targerian Maester Aemon 4g 258 51752 260 51334 0 0 260 676 8 260 0
2 2018-07-06 18:30 Blithe And Beautiful raven Mo Targerian Rheagar 3g 1094 19562 470 17130 4 4 470 3526 6 470 0
3 2018-07-06 00:45 Illuminating Emerald raven Daffy Lannister Kevan 2g 432 1488 558 638 272 272 558 1282 430 558 0
4 2018-07-06 12:00 Blithe And Beautiful raven Mo Greyjoy Theon 4g 1094 25132 576 13054 4 4 576 13172 462 576 0
5 2018-07-06 16:15 Blithe And Beautiful raven Mo Greyjoy Aeron 2g 44856 51494 436 50576 2 2 436 45774 14 436 0
6 2018-07-06 03:45 Goofy raven Cleo Baelish Petyr Baelish 4g 434 3018 458 2178 366 366 458 1274 366 458 0
7 2018-07-06 04:45 Shocking Pink raven Wilbur Lannister Cersei 2g 17138 24412 1818 21714 382 382 1818 19836 1288 1818 0
8 2018-07-06 10:30 Shocking Pink raven Wilbur Targerian Maester Aemon 2g 0 1760 448 1290 2 2 448 470 384 448 0
9 2018-07-06 12:15 Shocking Pink raven Wilbur Greyjoy Balon 2g 1286 3834 1692 1628 2 2 1692 3492 16 1692 0

Top 10 ravens with fails

In [10]:
Telenor2[['RAVEN_NAME','DATETIME']].groupby('RAVEN_NAME').count().sort_values(by = 'DATETIME', ascending = False).head(10)
Out[10]:
DATETIME
RAVEN_NAME
Brass raven Birdy 218372
Brown raven Ruby 210211
Yellow raven Rio 209263
Blue raven Axel 190226
Razzle Dazzle Rose raven Cleo 186966
Cadmium Red raven Bubba 184785
Vain And Lazy raven Polly 177113
Fearful Carrion raven Gizmo 175197
Blast Off Bronze raven Zazu 169995
Loving raven Maxwell 169584

Top 10 ravens without fails

In [11]:
Telenor1[['RAVEN_NAME','DATETIME']].groupby('RAVEN_NAME').count().sort_values(by = 'DATETIME', ascending = False).head(10)
Out[11]:
DATETIME
RAVEN_NAME
Metallic Sunburst raven Polly 297
Green Sheen raven Azul 211
Less Combative raven Zazu 191
Weak raven Buddy 188
Copper raven Tweety 179
Spectral Yellow raven Zazu 148
Mythical raven Tiki 116
Cyber Grape raven Faith 104
Mysterious And Venerable raven Bubba 98
Shadow Blue raven Sammy 95

The family with most fails

In [12]:
Telenor2[['FAMILY_NAME','DATETIME']].groupby('FAMILY_NAME').count().sort_values(by = 'DATETIME', ascending = False).head(1)
Out[12]:
DATETIME
FAMILY_NAME
Targerian 8154000

The family with least fails

In [13]:
Telenor2[['FAMILY_NAME','DATETIME']].groupby('FAMILY_NAME').count().sort_values(by = 'DATETIME', ascending = True).head(1)
Out[13]:
DATETIME
FAMILY_NAME
Baelish 2744406

The family member with most fails

In [ ]:
Telenor2[['MEMBER_NAME']].groupby('MEMBER_NAME').count().sort_values(by = 'DATETIME', ascending = False).head(1)

The family member with least fails

In [15]:
Telenor2[['MEMBER_NAME','DATETIME']].groupby('MEMBER_NAME').count().sort_values(by = 'DATETIME', ascending = True).head(1)
Out[15]:
DATETIME
MEMBER_NAME
Euron 491454

Changing the datetime column in to dateformat

In [16]:
Telenor2.DATETIME = pd.to_datetime(Telenor2.DATETIME)
/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py:4405: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

Splitting the DATETIME column into DATE and TIME column

In [23]:
Telenor2['DATE'] = [d.date() for d in Telenor2['DATETIME']]
Telenor2['TIME'] = [d.time() for d in Telenor2['DATETIME']]
Telenor2.head(10)
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[23]:
DATETIME RAVEN_NAME FAMILY_NAME MEMBER_NAME NETWORK FIRST_GET_RESPONSE_SUCCESS_D PAGE_BROWSING_DELAY TCP_SETUP_TOTAL_DELAY PAGE_CONTENT_DOWNLOAD_TOTAL_D FIRST_DNS_RESPONSE_SUCCESS_D DNS_RESPONSE_SUCCESS_DELAY FIRST_TCP_RESPONSE_SUCCESS_D PAGE_SR_DELAYS SYN_SYN_DELAY TCP_CONNECT_DELAY PAGE_BROWSING_DELAYS DATE TIME
0 2018-07-06 10:15:00 Razzle Dazzle Rose raven Shadow Targerian Maester Aemon 2g 644 2832 2100 730 2 2 2100 2746 4 2100 0 2018-07-06 10:15:00
1 2018-07-06 19:30:00 Tiny Insolent raven Lulu Targerian Maester Aemon 4g 258 51752 260 51334 0 0 260 676 8 260 0 2018-07-06 19:30:00
2 2018-07-06 18:30:00 Blithe And Beautiful raven Mo Targerian Rheagar 3g 1094 19562 470 17130 4 4 470 3526 6 470 0 2018-07-06 18:30:00
3 2018-07-06 00:45:00 Illuminating Emerald raven Daffy Lannister Kevan 2g 432 1488 558 638 272 272 558 1282 430 558 0 2018-07-06 00:45:00
4 2018-07-06 12:00:00 Blithe And Beautiful raven Mo Greyjoy Theon 4g 1094 25132 576 13054 4 4 576 13172 462 576 0 2018-07-06 12:00:00
5 2018-07-06 16:15:00 Blithe And Beautiful raven Mo Greyjoy Aeron 2g 44856 51494 436 50576 2 2 436 45774 14 436 0 2018-07-06 16:15:00
6 2018-07-06 03:45:00 Goofy raven Cleo Baelish Petyr Baelish 4g 434 3018 458 2178 366 366 458 1274 366 458 0 2018-07-06 03:45:00
7 2018-07-06 04:45:00 Shocking Pink raven Wilbur Lannister Cersei 2g 17138 24412 1818 21714 382 382 1818 19836 1288 1818 0 2018-07-06 04:45:00
8 2018-07-06 10:30:00 Shocking Pink raven Wilbur Targerian Maester Aemon 2g 0 1760 448 1290 2 2 448 470 384 448 0 2018-07-06 10:30:00
9 2018-07-06 12:15:00 Shocking Pink raven Wilbur Greyjoy Balon 2g 1286 3834 1692 1628 2 2 1692 3492 16 1692 0 2018-07-06 12:15:00
In [24]:
Telenor_Failures=Telenor2[['DATE','TCP_CONNECT_DELAY']].groupby('DATE').count()

Telenor_Failures.columns = ['FAILURES']
Telenor_Failures.head()
Out[24]:
FAILURES
DATE
2018-07-06 978982
2018-07-07 968497
2018-07-08 946722
2018-07-09 965899
2018-07-10 975015
In [33]:
Telenor_Failures.head()
Out[33]:
FAILURES
DATE
2018-07-06 978982
2018-07-07 968497
2018-07-08 946722
2018-07-09 965899
2018-07-10 975015
In [25]:
Telenor_Failures.to_csv('fail.csv')
In [26]:
dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d')
data = pd.read_csv('fail.csv', parse_dates=['DATE'], index_col='DATE',date_parser=dateparse)
data.head()
Out[26]:
FAILURES
DATE
2018-07-06 978982
2018-07-07 968497
2018-07-08 946722
2018-07-09 965899
2018-07-10 975015
In [40]:
data.dtypes
Out[40]:
FAILURES    int64
dtype: object
In [27]:
ts = Telenor_Failures['FAILURES']
ts.head()
Out[27]:
DATE
2018-07-06    978982
2018-07-07    968497
2018-07-08    946722
2018-07-09    965899
2018-07-10    975015
Name: FAILURES, dtype: int64
In [28]:
import matplotlib.pylab as plt
%matplotlib inline
plt.plot(ts)
Out[28]:
[<matplotlib.lines.Line2D at 0x7f9ed24acd30>]
In [29]:
## Check for stationarity
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    
    #Determing rolling statistics
    rolmean = pd.rolling_mean(timeseries, window=12)
    rolstd = pd.rolling_std(timeseries, window=12)

    #Plot rolling statistics:
    orig = plt.plot(timeseries, color='blue',label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label = 'Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    
    #Perform Dickey-Fuller test:
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)
In [30]:
import numpy as np
In [32]:
#test_stationarity(ts)
In [34]:
#fig = decomposition.plot()
In [ ]:
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
In [41]:
init_notebook_mode(connected = True)

cf.go_offline()
In [ ]:
Telenor_Failures.iplot(title = 'Total Failures on aggregate of every 15 minutes')
In [ ]:
Telenor_Failures.head()
In [31]:
from pyramid.arima import auto_arima
stepwise_model = auto_arima(Telenor_Failures, start_p=1, start_q=1,
                           max_p=3, max_q=3, m=12,
                           start_P=0, seasonal=True,
                           d=1, D=1, trace=True,
                           error_action='ignore',  
                           suppress_warnings=True, 
                           stepwise=True)
print(stepwise_model.aic())
Fit ARIMA: order=(1, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=427.823, BIC=429.604, Fit time=0.058 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=429.385, BIC=432.947, Fit time=0.192 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(1, 1, 0, 12); AIC=427.909, BIC=430.580, Fit time=0.066 seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(1, 1, 1, 12); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(0, 1, 0, 12); AIC=429.302, BIC=431.973, Fit time=0.052 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 1, 0, 12); AIC=429.294, BIC=431.965, Fit time=0.064 seconds
Fit ARIMA: order=(1, 1, 1) seasonal_order=(0, 1, 0, 12); AIC=431.001, BIC=434.562, Fit time=0.088 seconds
Total fit time: 0.545 seconds
427.8234729008545
In [ ]:
stepwise_model.predict(5)
In [ ]:
print(stepwise_model.aic())