!pip install --upgrade pip
import pandas as pd
import s3fs
import scipy
import matplotlib as plot
import requests as rs
import numpy as np
import zipfile
# Connect to S3 anonymously (anon=True) and list the datathon folder,
# followed by the Telenor case folder inside it.
DATATHON_PREFIX = 'datacases/datathon-2018-2/'
fs = s3fs.S3FileSystem(anon=True)
fs.ls(DATATHON_PREFIX)
fs.ls(DATATHON_PREFIX + 'telenor')
# Disabled download step: when uncommented it fetches the archive from URL
# and saves it as "code.zip", which is the file read below.
#URL="https://storage.googleapis.com/global-datathon-2018/telenor/data.zip"
#print("downloading with requests")
#r = rs.get(URL)
#with open("code.zip", "wb") as code:
# code.write(r.content)
#print("done downloading")

# Load data.csv (semicolon-separated, Latin-1 encoded) directly from the
# local archive. Both the archive and the member handle are closed as soon
# as the frame has been built; reopen the archive if another member is
# needed later.
with zipfile.ZipFile('code.zip') as zf, zf.open('data.csv') as csv_member:
    Telenor = pd.read_csv(csv_member, encoding="latin", delimiter=";")

# Quick look at the first rows, the frame dimensions and the column dtypes.
Telenor.head(10)
Telenor.shape
Telenor.dtypes
# The eleven metric columns that are compared against zero below.
# NOTE(review): both PAGE_BROWSING_DELAY and PAGE_BROWSING_DELAYS are tested,
# exactly as before -- confirm they are meant to be two distinct columns.
DELAY_COLUMNS = [
    'FIRST_GET_RESPONSE_SUCCESS_D',
    'PAGE_BROWSING_DELAY',
    'TCP_SETUP_TOTAL_DELAY',
    'PAGE_CONTENT_DOWNLOAD_TOTAL_D',
    'FIRST_DNS_RESPONSE_SUCCESS_D',
    'DNS_RESPONSE_SUCCESS_DELAY',
    'FIRST_TCP_RESPONSE_SUCCESS_D',
    'PAGE_SR_DELAYS',
    'SYN_SYN_DELAY',
    'TCP_CONNECT_DELAY',
    'PAGE_BROWSING_DELAYS',
]
delay_values = Telenor[DELAY_COLUMNS]

# Telenor1: rows in which every listed column equals 0.
Telenor1 = Telenor[(delay_values == 0).all(axis=1)]
Telenor1.head(10)

# Telenor2: rows in which at least one listed column differs from 0.
Telenor2 = Telenor[(delay_values != 0).any(axis=1)]
Telenor2.head(10)
def _rank_by_record_count(frame, key, ascending, top):
    """Count non-null DATETIME values per `key`, sort by that count and keep `top` rows."""
    per_group = frame[[key, 'DATETIME']].groupby(key).count()
    return per_group.sort_values(by='DATETIME', ascending=ascending).head(top)


# Ten RAVEN_NAME values with the highest counts, in Telenor2 and then in Telenor1.
_rank_by_record_count(Telenor2, 'RAVEN_NAME', ascending=False, top=10)
_rank_by_record_count(Telenor1, 'RAVEN_NAME', ascending=False, top=10)

# FAMILY_NAME with the highest, then the lowest, count in Telenor2.
_rank_by_record_count(Telenor2, 'FAMILY_NAME', ascending=False, top=1)
_rank_by_record_count(Telenor2, 'FAMILY_NAME', ascending=True, top=1)

# MEMBER_NAME with the highest, then the lowest, count in Telenor2.
_rank_by_record_count(Telenor2, 'MEMBER_NAME', ascending=False, top=1)
_rank_by_record_count(Telenor2, 'MEMBER_NAME', ascending=True, top=1)
# Telenor2 was obtained by boolean-indexing Telenor, so writing columns into
# it directly is chained assignment and raises SettingWithCopyWarning.
# Work on an explicit, independent copy instead.
Telenor2 = Telenor2.copy()

# Parse the timestamp column once, then derive the calendar date and the
# time of day with the vectorised .dt accessor instead of Python-level loops.
Telenor2['DATETIME'] = pd.to_datetime(Telenor2['DATETIME'])
Telenor2['DATE'] = Telenor2['DATETIME'].dt.date
Telenor2['TIME'] = Telenor2['DATETIME'].dt.time
Telenor2.head(10)
# Per-DATE count of Telenor2 rows that have a non-null TCP_CONNECT_DELAY,
# exposed as a single column named FAILURES (the index is DATE).
# The count column is renamed explicitly rather than by overwriting
# `.columns` positionally, and the duplicated preview call is dropped.
Telenor_Failures = (
    Telenor2[['DATE', 'TCP_CONNECT_DELAY']]
    .groupby('DATE')
    .count()
    .rename(columns={'TCP_CONNECT_DELAY': 'FAILURES'})
)
Telenor_Failures.head()
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Initialise plotly's notebook mode and put cufflinks into offline mode
# before drawing the interactive chart.
init_notebook_mode(connected=True)
cf.go_offline()

# NOTE(review): Telenor_Failures is grouped by DATE upstream, while the title
# speaks of 15-minute aggregation -- confirm the wording matches the data.
failures_chart_title = 'Total Failures on aggregate of every 15 minutes'
Telenor_Failures.iplot(title=failures_chart_title)
# NOTE(review): the pyramid-arima package appears to have been republished
# under the name pmdarima -- confirm which distribution this environment has.
from pyramid.arima import auto_arima

# Options for the automatic ARIMA order search over the FAILURES series.
arima_search_options = dict(
    # non-seasonal order search range
    start_p=1, start_q=1,
    max_p=3, max_q=3,
    # seasonal component with period m
    seasonal=True, m=12,
    start_P=0,
    # differencing orders passed explicitly
    d=1, D=1,
    # search behaviour and reporting
    stepwise=True,
    trace=True,
    error_action='ignore',
    suppress_warnings=True,
)
stepwise_model = auto_arima(Telenor_Failures, **arima_search_options)

# Report the AIC of the selected model.
print(stepwise_model.aic())