# import pandas as pd
import pandas as pd
# import numpy as np
import numpy as np
 
# simple array
v = np.array([1,2,3,65,765])
data=v
print("Content of", type(v), "is:",v)
print("VALUE AT POS1 in array is:",(v[0]))

data = pd.Series([1,2,3,65,765])
#data=pd.Series([1,2,3,65,765],index=['a', 'b', 'c', 'd','e'])

#Creating data series
ser = pd.Series(data) ###Series function is used to create a data series
print("Series content is: \n", ser)
print("1st element of a dataseries is:", ser[0])
#retrieve the first N elements of series
print(ser[:3])

"""NOTE: Default index starts from 0"""

### EXPLICITLY ASSIGNING INDEX
ser1 = pd.Series([1,2,3,4,5],index=[11,12,13,14,15])
print(ser1)
ser = pd.Series(data,index=['a','b','c','d','e'])
print(ser)

newser = pd.Series(data,index=range(1,6))
print(newser)
  
"""NOTE: Data series is a single dimensional datatype holding heterogeneous data. 
The axis labels are collectively referred to as the index."""

Content of <class 'numpy.ndarray'> is: [  1   2   3  65 765]
VALUE AT POS1 in array is: 1
Series content is: 
 0      1
1      2
2      3
3     65
4    765
dtype: int64
1st element of a dataseries is: 1
0    1
1    2
2    3
dtype: int64
11    1
12    2
13    3
14    4
15    5
dtype: int64
a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
dtype: float64
1      2.0
2      3.0
3     65.0
4    765.0
5      NaN
dtype: float64

'NOTE: Data series is a single dimensional datatype holding heterogeneous data. \nThe axis labels are collectively referred to as the index.'

import pandas as pd
D1 = {"Numbers": [0,1,2,3,4]}
print(type(D1),D1)
ds = pd.Series(D1)
print(type(ds),ds)
D2 = {'Quarter':['Q1','Q2','Q3','Q4'], 'Sold':[100,120,90,150]}
print(D1)
df=pd.DataFrame(D2)
print(df)
print(df['Quarter'])
print(df[ ['Quarter'] ])
print(df['Sold'])
df['Sold'] > 100
df[df['Sold'] <110]
data = pd.Series([1,2,3,65,765], index=['a', 'b', 'c', 'd','e'])
data

<class 'dict'> {'Numbers': [0, 1, 2, 3, 4]}
<class 'pandas.core.series.Series'> Numbers    [0, 1, 2, 3, 4]
dtype: object
{'Numbers': [0, 1, 2, 3, 4]}
  Quarter  Sold
0      Q1   100
1      Q2   120
2      Q3    90
3      Q4   150
0    Q1
1    Q2
2    Q3
3    Q4
Name: Quarter, dtype: object
  Quarter
0      Q1
1      Q2
2      Q3
3      Q4
0    100
1    120
2     90
3    150
Name: Sold, dtype: int64

a      1
b      2
c      3
d     65
e    765
dtype: int64

"""Before importing pandas it needs to be installed using command pip install pandas."""
import pandas as pd
import numpy as np
v1=[1,2,3,65,765] ##LIST
#v1 = np.array([1,2,3,65,765]) ###ARRAY
data = pd.Series(v1)
print(data)

#Creating data series
ser = pd.Series(data) ###Series function is used to create a data series
print("Series content is: \n", ser)
print("1st element of a dataseries is:", ser[0])
#retrieve the first N elements of series
print(ser[:3])

"""NOTE: Default index starts from 0"""

### EXPLICITLY ASSIGNING INDEX
ser = pd.Series(v1,index=[11,12,13,14,15])
#ser = pd.Series(v1,index=['a','b','c','d','e'])
print(ser)

newser = pd.Series(data,index=range(0,5))
print(newser)
"""NOTE: Data series is a single dimensional datatype holding heterogeneous data. 
The axis labels are collectively referred to as the index."""

0      1
1      2
2      3
3     65
4    765
dtype: int64
Series content is: 
 0      1
1      2
2      3
3     65
4    765
dtype: int64
1st element of a dataseries is: 1
0    1
1    2
2    3
dtype: int64
11      1
12      2
13      3
14     65
15    765
dtype: int64
0      1
1      2
2      3
3     65
4    765
dtype: int64

'NOTE: Data series is a single dimensional datatype holding heterogeneous data. \nThe axis labels are collectively referred to as the index.'

# importing pandas module  
import pandas as pd  
     
### reading csv file
df = pd.read_csv("WBCDdata.csv") 

"""read_csv() automatically converts the file content into a 2D dataframe(table)"""


### Extracting country column(series) from the dataframe df
###Indexing a Series using indexing operator [] :
ser = pd.Series(df['radius_mean']) 

### Reading first 20 instances of the series
data = ser.head(10)
print(data) 

# using indexing operator
print(data[3:6])

# using .iloc() function ###implicit indexing
print(data.iloc[3:6])

0    17.99
1    20.57
2    19.69
3    11.42
4    20.29
5    12.45
6    18.25
7    13.71
8    13.00
9    12.46
Name: radius_mean, dtype: float64
3    11.42
4    20.29
5    12.45
Name: radius_mean, dtype: float64
3    11.42
4    20.29
5    12.45
Name: radius_mean, dtype: float64

####PERFORMING OPERATIONS ON SERIES DATA
# importing pandas module  
import pandas as pd  
 
# creating a series
data1 = pd.Series([5, 2, 3,7], index=['a', 'b', 'c', 'd'])
 
# creating a series
data2 = pd.Series([1, 6, 4, 9], index=['a', 'b', 'd', 'e'])
 
print("data2 details \n", data2, "\n\n", "data1 details \n", data1)
print(data1+data2)
#### Handling nan
data1.add(data2, fill_value=0)

data2 details 
 a    1
b    6
d    4
e    9
dtype: int64 

 data1 details 
 a    5
b    2
c    3
d    7
dtype: int64
a     6.0
b     8.0
c     NaN
d    11.0
e     NaN
dtype: float64

a     6.0
b     8.0
c     3.0
d    11.0
e     9.0
dtype: float64

### DATATYPE CONVERSION OPERATIONS

# importing pandas module  
import pandas as pd 
   
# reading csv file from url  
data = pd.read_csv("WBCDdata.csv") 
    
# dropping null value columns to avoid errors 
data.dropna(inplace = True) 
   
# storing dtype before converting 
before = data.dtypes 
   
### converting dtypes using astype 
data["diagnosis"]= data["diagnosis"].astype(str) 
data["radius_mean"]= data["radius_mean"].astype(str) 
   
# storing dtype after converting 
after = data.dtypes 
   
# printing to compare 
print("BEFORE CONVERSION\n", before, "\n") 
print("AFTER CONVERSION\n", after, "\n") 

### Converting to list
# converting to list 
country_list = data["diagnosis"].tolist()
country_list

BEFORE CONVERSION
 id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
Unnamed: 32                float64
dtype: object 

AFTER CONVERSION
 id                           int64
diagnosis                   object
radius_mean                 object
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
Unnamed: 32                float64
dtype: object

[]

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
x=range(1,1000)
print("original array:", x)
ts = pd.Series(x)
ts = ts.cumsum()
print("data series", ts)
ts.plot()

original array: range(1, 1000)
data series 0           1
1           3
2           6
3          10
4          15
5          21
6          28
7          36
8          45
9          55
10         66
11         78
12         91
13        105
14        120
15        136
16        153
17        171
18        190
19        210
20        231
21        253
22        276
23        300
24        325
25        351
26        378
27        406
28        435
29        465
        ...  
969    470935
970    471906
971    472878
972    473851
973    474825
974    475800
975    476776
976    477753
977    478731
978    479710
979    480690
980    481671
981    482653
982    483636
983    484620
984    485605
985    486591
986    487578
987    488566
988    489555
989    490545
990    491536
991    492528
992    493521
993    494515
994    495510
995    496506
996    497503
997    498501
998    499500
Length: 999, dtype: int64

<matplotlib.axes._subplots.AxesSubplot at 0x254ebd719e8>

INDEXING & SLICING¶

"""Indexing means referring to an element of an iterable by its position within the iterable. “Slicing” means getting a subset of elements from an iterable based on their indices"""
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0, 1.25, 1.5], index=['a', 'b', 'c', 'd', 'e', 'f'])
#ind = pd.Index([1, 3, 5, 7])
#data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[ind])
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
f    1.50
dtype: float64

import pandas as pd
r=('a', 'b', 'c', 'd', 'e', 'f') #Tuple
print(type(r))
c=[0.25, 0.5, 0.75, 1.0, 2, 1.75] #List of elements
print(type(c))
data = pd.Series(c, index=r)
print("original dataseries is: \n", data)
s3 = pd.Series([1.25], index=['g'])
"""You can also include more than one new elements into your data series (as given in line9).
Also you can create index using Index constructor (as given in line10)."""
#my_indx = pd.Index(['g','h','i','j'])
#s3 = pd.Series([40,50,60,70], index=[my_indx])
res=data.append(s3)
print("Appended value is:",s3)
print("Dataseries after appending is\n",res)
print(" result in sorted order is:",res.sort_values(ascending=True))##Check with False

<class 'tuple'>
<class 'list'>
original dataseries is: 
 a    0.25
b    0.50
c    0.75
d    1.00
e    2.00
f    1.75
dtype: float64
Appended value is: g    1.25
dtype: float64
Dataseries after appending is
 a    0.25
b    0.50
c    0.75
d    1.00
e    2.00
f    1.75
g    1.25
dtype: float64
 result in sorted order is: a    0.25
b    0.50
c    0.75
d    1.00
g    1.25
f    1.75
e    2.00
dtype: float64

print(data)
print(data['b']) 
#print(data[3])
print(data[1]) #implicit
"""Please note that output of print(data['b']) and print(data[1]) is same but the former uses explicit/userdefined indexing
where as the later uses implicit indexing scheme which begins with zeroth index."""

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
f    1.50
dtype: float64
0.5
0.5

"Please note that output of print(data['b']) and print(data[1]) is same but the former uses explicit/userdefined indexing\nwhere as the later uses implicit indexing scheme which begins with zeroth index."

###Checking for presence of explicit row indices in pandas
#print(1 in data)
print('b' in data)
print(data.keys())
###Checking for presence of implicit row indices in pandas
print(2 in data)
####Check what happens if you uncomment the next line 
#ind[1]
print(data.values)
data.values[0] #indexing

True
Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
False
[0.25 0.5  0.75 1.   1.25 1.5 ]

0.25

# using indexing operator
print(data['b':'d'])
# using .iloc() function ###implicit indexing
print(data[1:2])

b    0.50
c    0.75
d    1.00
dtype: float64
b    0.5
dtype: float64

# using .iloc() function ###explicit indexing using loc (location)
print(data.loc['b'])
# using .iloc() function ###implicit indexing (indexed location)
print(data.iloc[1:2])

0.5
b    0.5
dtype: float64