wget https://repo.anaconda.com/archive/Anaconda3-
2020.02
-Linux-x86_64.sh
sh Anaconda3-
2020.02
-Linux-x86_64.sh
1
按回车
2
yes
3
选择安装目录,这个时候目录不变,回车
需要两分钟
4
安装完之后需要选择什么什么玩意,选择yes
在根目录中会看见一个anaconda的文件夹,进入文件夹可以看到很多目录
cd ~/anaconda3
cd
bin
可以很多的文件夹
./anaconda --version
./jupyter notebook --no-browser
./jupyter notebook --no-browser --allow-root
ssh端口转发
ssh -N -f -L localhost:
8888
:localhost:
8888
root@{ip地址}
>>>password
执行
print
(
'hello world'
)
:ifconfig
1-6 Jupyter-notebook的使用演示
进入jupeter浏览器中之后
点击new-->Terminal,显示命令行界面
/root/anaconda3/bin/jupyter notebook --no-browser --allow-root
第2章 Numpy入门
2-1 数据科学领域5个常用Python库
numpy
scipy
Pandas
Matplotlib
Scikit-learn
Number
数据处理里面最基础的库
N维数组(矩阵),快速高效,矢量数学运算
高效的index,不需要循环
开源免费跨平台,运行效率足以和C/Matlab媲美
Scipy
依赖于Numpy
专为科学和工程设计
实现了多种常用科学计算:线性代数,傅里叶变换,信号和图像处理
Pandas
结构化数据分析利器(依赖Numpy)
提供了多种高级数据结构:Time-Series,DataFrame,Panel
强大的数据索引和处理能力
Matplotlib
Python 2D绘图领域使用最广泛的套件
基本能取代Matlab的绘图功能(散点,曲线,柱形等)
通过mplot3d可以绘制精美的3D图
Scikit-learn
机器学习的Python模块
建立在Scipy之上,提供了常用的机器学习算法:聚类,回归
简单易学的API接口
2-2 数学基础回顾之矩阵运算
矩阵:是指1xn或者nx1的矩阵
标量:1x1的矩阵
数组:N维的数组,是矩阵的延伸
矩阵加减运算
相加,相减的两个矩阵必须要有相同的行和列
行和列对应元素相加减
数组乘法(点乘)
清华大学出版的线性代数
http://www.wdfxw.net/goDownFiles.aspx?key=92039718
2-3 Array的创建及访问
import munpy as np
list_1 = [1,2,3,4]
array_1 = np.array(list_1)
list_2 = [6,7,8,9]
array_2 = np.array([list_1,list_2])
print(array_2)
print(array_2.size)
print(array_2.shape)
print(array_2.dtype)
array_4 = np.arange(1,10,2)
print(array_4)
np.zeros(5)
np.zeros([2,3])
array([[0., 0., 0.],
[0., 0., 0.]])
np.eye(5)
array([[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.],
[0., 0., 1., 0., 0.],
[0., 0., 0., 1., 0.],
[0., 0., 0., 0., 1.]])
a = np.arange(10)
print(a)
a[2]
a[1:5]
b = np.array([[1,2,3,4],[5,6,7,8]])
b[1,0]
c = np.array([[1,2,3],[4,5,6],[7,8,9]])
c[:2,1:]
2-4 数组与矩阵运算
快速创建数组
import numpy as np
np.random.randn(10)
array([-0.7512065 , 0.97527973, -1.24433992, 0.86890475, -0.51251532,
-0.02522675, -0.40664444, 0.66399272, -0.94669869, 1.52843227])
np.random.randint(10)
np.random.randint(10, size=(2,3))
array([[1, 8, 1],
[6, 1, 8]])
np.random.randint(10,size=20).reshape(4,5)
array([[6, 6, 4, 8, 9],
[0, 9, 1, 0, 8],
[6, 2, 6, 1, 3],
[5, 4, 8, 9, 2]])
数组的运算
a = np.random.randint(10,size=20).reshape(4,5)
b = np.random.randint(10,size=20).reshape(4,5)
a+b
a-b
a * b
a / b
矩阵的命令其实跟数组差不多
np.mat([[1,2,3],[4,5,6]])
matrix([[1, 2, 3],
[4, 5, 6]])
a = np.array([1,2,3])
np.mat(a)
矩阵的运算
A = np.mat(np.random.randint(10,size=20).reshape(4,5))
B = np.mat(np.random.randint(10,size=20).reshape(4,5))
A+B
A-B
A = np.mat(np.random.randint(10,size=20).reshape(4,5))
B = np.mat(np.random.randint(10,size=20).reshape(5,4))
A * B
matrix([[ 59, 137, 69, 80],
[ 77, 174, 124, 142],
[ 48, 128, 44, 124],
[ 54, 121, 102, 94]])
Array常用函数
import numpy as np
a = np.random.randint(10, size=20).reshape(4,5)
np.unqiue(a)
sum(a)
matrix([[21, 16, 24, 17, 9]])
sum(a[0])
sum(a[:,0])
A.max()
a.max(a[0])
a.max(a[:,0])
2-5 Array的input和output
import numpy as np
x = np.arange(10)
f = open('x.pk1','wb')
pickle.dump(x,f)
f = open('x.pk1','rb')
pickle.load(f)
np.save('one_array',x)
np.load('one_array.npy')
a = np.arange(10)
b = np.arange(20)
np.savez('two_array.npz',a=a,b=b)
c = np.load('two_array.npz')
c['a']
c['b']
第3章 Pandas入门
3-1 Pandas Series
import numpy as np
import pandas as pd
s1 = pd.Series([1,2,3,4,1])
s1
0 1
1 2
2 3
3 4
4 1
dtype: int64
s1.values
s1.index
s2 = pd.Series(np.arange(10))
s3 = pd.Series({'1':1,'2':2,'3':3})
s3.index
Index(['1', '2', '3'], dtype='object')
s4 = pd.Series([1,2,3,4],index=['A','B','C','D'])
s4.values
s4.index
s4['A']
s4[s4>2]
s4.to_dict()
{'A': 1, 'B': 2, 'C': 3, 'd': 4}
s5 = pd.Series(s4.to_dict())
index_1 = ['O','A','B','C','D','E']
s6 = pd.Series(s5,index=index_1)
s6
O NaN
A 1.0
B 2.0
C 3.0
D NaN
E NaN
dtype: float64
pd.isnull(s6)
O True
A False
B False
C False
D True
E True
dtype: bool
pd.notnull(s6)
O False
A True
B True
C True
D False
E False
dtype: bool
s6.name = 'demo'
O NaN
A 1.0
B 2.0
C 3.0
D NaN
E NaN
Name: demo, dtype: float64
s6.index.name='demo index'
Index(['O', 'A', 'B', 'C', 'D', 'E'], dtype='object', name='demo index')
s6.index
Index(['O', 'A', 'B', 'C', 'D', 'E'], dtype='object', name='demo index')
s6.values
array([nan, 1., 2., 3., nan, nan])
Jun 2020 | Jun 2019 | Change | Programming Language | Ratings | Change |
---|
1 | 2 | | C | 17.19% | +3.89% |
2 | 1 | | Java | 16.10% | +1.10% |
3 | 3 | | Python | 8.36% | -0.16% |
4 | 4 | | C++ | 5.95% | -1.43% |
5 | 6 | | C# | 4.73% | +0.24% |
6 | 5 | | Visual Basic | 4.69% | +0.07% |
7 | 7 | | JavaScript | 2.27% | -0.44% |
8 | 8 | | PHP | 2.26% | -0.30% |
9 | 22 | | R | 2.19% | +1.27% |
10 | 9 | | SQL | 1.73% | -0.50% |
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import webbrowser
link = 'https://www.tiobe.com/tiobe-index/'
webbrowser.open(link)
df = pd.read_clipboard()
print(type(df))
df.columns
Index(['Jun 2020', 'Jun 2019', 'Change', 'Programming Language', 'Ratings',
'Change.1'],
dtype='object')
df.Change
df_new = DataFrame(df,columns=['Jun 2020', 'Jun 2019', 'Change'])
df["Jun 2019"]
df_new = DataFrame(df,columns=['Change', 'Programming Language', 'Ratings','new_column'])
df_new['new_column'] = range(10)
df_new['new_column'] = np.arange(10)
df_new['new_column'] = pd.Series(np.arange(10))
df_new['new_column'] = pd.Series([100,200],index=[1,2])
3-3 深入理解Series和Dataframe
data = {
'Country': ['Belgium','India','Brazil'],
'Capital':['Brussels','New Belhi','Brasilia'],
'Population':[11190846,1303171035,207847528]
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = pd.Series(data['Country'], index=['A','B','C'])
A Belgium
B India
C Brazil
dtype: object
s1.name='Country'
s1.values
array(['Belgium', 'India', 'Brazil'], dtype=object)
s1.index
Index(['A', 'B', 'C'], dtype='object')
DataFrame
df1 = DataFrame(data)
cap = df1['Capital']
type(cap)
df1.iterrows()
[i for i in df1.iterrows()]
使用series对象生成DataFrame
s1 = Series(data['Country'])
s2 = Series(data['Capital'])
s3 = Series(data['Population'])
df_new = DataFrame([s1,s2,s3], index=['Country','Capital','Population'])
df_new = df_new.T
print(df_new)
3-4 Pandas-Dataframe-IO操作
pandas.pydata.org/pandas-docs…
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import webbrowser
link = 'https://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)
df1 = pd.read_clipboard()
df1.to_clipboard()
df1.to_csv("df1.csv")
!more df1.csv
df2 = pd.read_csv('df1.csv')
df1.to_json()
pd.read_json(df1.to_json())
df1.to_html('df1.html')
df1.to_excel('df1.xlsx')
pd.read_excel('df1.xlsx')
3-5 DataFrame的Selecting和indexing
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
imdb = pd.read_csv('movie_metadata.csv')
imdb.shape
imdb.head()
imdb.tail()
imdb['director_facebook_likes']
imdb[['director_name','director_name']]
sub_df = imdb[['director_name','movie_title','imdb_score']]
sub_df.iloc[10:20,0:]
tmp_df = imdb[['director_name','movie_title','imdb_score']].iloc[10:20,0:2]
tmp_df.iloc[2:4,:]
tmp_df.loc[15:17,:'director_name']
注:带有i就是基于索引的
3-6 Series和Dataframe的Reindexing
reindex Series
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = Series([1,2,3,4], index=['A','B','C','D'])
s1.reindex(index=['G','A','B','C','D','F','E'])
s1.reindex(index=['G','A','B','C','D','F','E'],fill_value=10)
s2 = Series(['A','B','C'], index=[1,5,10])
s2.reindex(index=range(15),method='ffill')
reindex dataframe
df1 = DataFrame(np.random.rand(25).reshape([5,5]))
df1 = DataFrame(np.random.rand(25).reshape([5,5]),index= ['A','B','D','E','F'],columns=['c1','c2','c3','c4','c5'])
df1.reindex(index=['A','B','C','D','E','F'])
df1.reindex(columns=['a1','a2','c1','c2'])
df1.reindex(index=['A','B','C','D','E','F'],columns=['a1','a2','c1','c2'])
s1.drop('A')
df1.drop('A',axis=0)
df1.drop('c1',axis=1)
3-7 谈一谈NaN
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
n = np.nan
type(n)
m = 1
print(m+n)
s1 = Series([1,2,np.nan,4,5],index=['A','B','C','D','E'])
s1.isnull()
s1.notnull()
s1.dropna()
dframe = DataFrame([[1,2,3],[np.nan,5,6],[7,np.nan,9],[np.nan,np.nan,np.nan]])
dframe.isnull()
dframe.notnull()
dframe.dropna(axis=0,how='any')
dframe.dropna(axis=1,how='any')
删除NaN和NaN的替换
dframe2 = DataFrame([[1,2,3,n],[2,n,5,6],[n,7,n,9],[1,n,n,n]])
dframe2.dropna(thresh=2)
dframe2.fillna(value=1)
dframe2.fillna(value={0:0,1:1,2:2,3:3})
3-8 多级Index
s1 = Series(np.random.randn(6), index=[['1','1','1','2','2','2'],['a','b','c','a','b','c']])
s1['1']['a']
s1[:,'a']
1 2.371817
2 0.782539
dtype: float64
s1.unstack()
a b c
1 0.487468 1.150074 -1.077194
2 1.068546 -0.459993 -0.131780
df2 = DataFrame([s1['1'],s1['2']])
a b c
0 0.487468 1.150074 -1.077194
1 1.068546 -0.459993 -0.131780
s2 = df2.T.unstack()
df = DataFrame(np.arange(16).reshape(4,4))
df = DataFrame(np.arange(16).reshape(4,4), index=[['a','a','b','b'],[1,2,1,2]],columns=[['BJ','TJ','SH','WH'],[1,2,1,2]])
type(df['BJ'][1])
3-9 Mapping和Replace
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({'城市':['北京','上海','广州'],'人口':[1000,2000,3000]})
城市 人口
0 北京 1000
1 上海 2000
2 广州 3000
df1['GDP'] = Series([0.1,0.2,0.3])
df1['GDP'] = df1['城市'].map({'北京':1,'上海':2,'广州':3})
s1 = Series(np.arange(10))
0 0
1 1
2 2
3 3
dtype: int64
s1.replace(to_replace=3,value=np.nan)
0 0.0
1 1.0
2 2.0
3 NaN
dtype: float64
s1.replace([1,2,3],[10,20,30])
s1.replace([1,2,3],[10,20,30]) # 同时替换多个值
第4章 Pandas玩转数据
4-1 DataFrame的简单数学计算
Series的运算
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
s1 = Series([1,2,3], index=['A','B','C'])
s2 = Series([4,5,6,7], index=['B','C','D','E'])
s1+s2
A NaN
B 6.0
C 8.0
D NaN
E NaN
dtype: float64
DataFrame的运算
df1 = DataFrame(np.arange(9).reshape(3,3), index=['A','B','C'], columns=['BJ','SH','WH'])
df2 = DataFrame(np.arange(4).reshape(2,2), index=['A','B'], columns=['BJ','SH'])
df1+df2
BJ GZ SH
A 0.0 NaN 2.0
B 5.0 NaN 7.0
C NaN NaN NaN
df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]],index=['A','B','C'], columns=['c1','c2','c3'])
df3.max()
df3.sum()
df3.min()
df3.describe()
4-2 Series和DataFrame的排序
Series排序
s1 = Series(np.random.randn(10))
s1.sort_values(ascending=False)
s1.sort_index(ascending=True)
DataFrame排序
df1 = DataFrame(np.random.arange(40).reshape(8,5), columns=['A','B','C','D','E'])
df1['A'].sort_values()
df2 = df1.sort_values('A')
df2.sort_index(ascending=False)
df2[['A','B']]
1 保留imdb_score,director_name,movie_title这几个column
2 使用imdb_score排序,倒序
3 使用一行代码
df = pd.read_csv('movie_metadata.csv')[['imdb_score','director_name','movie_title']].sort_values('imdb_score',ascending=False)
df.iloc[:20,:]
4-3 重命名Dataframe的index
df1 = DataFrame(np.random.randn(9).reshape(3,3), index=['BJ','SH','GZ'], columns=['A','B','C'])
df1.index = Series(['bi','sh','gz'])
df1.index = df1.index.map(str.upper)
df1.rename(index=str.lower,columns=str.lower)
df1.rename(index={'BJ':'beijing'}, columns={'A':'a'})
df1.rename(index=lambda x:x+'_ABC')
4-4 DataFrame的merge操作
df1 = DataFrame({'key':['A','y','z'],'data_set_1':[1,2,3]})
df2 = DataFrame({'key':['A','B','C'],'data_set_2':[4,5,6]})
pd.merge(df1,df2,how='left',on='key')
4-5 Concatenate和Combine
arr1 = np.arange(9).reshape(3,3)
arr2 = np.arange(9).reshape(3,3)
np.concatenate([arr1,arr2])
np.concatenate([arr1,arr2],axis=1)
s1 = Series([1,2],index=['X','Y'])
s2 = Series([3,4], index=['A','B'])
pd.concat([s1,s2])
pd.concat([s1,s2], axis=1)
df1 = DataFrame(np.random.randn(9).reshape(3,3), columns=['X','Y','Z'])
df2 = DataFrame(np.random.randn(3,3),columns=['X','Y','A'])
pd.concat([df1,df2])
s1 = Series([1,np.nan,3,np.nan], index=['A','B','C','D'])
s2 = Series([np.nan,2,np.nan,4], index=['A','B','C','D'])
s1.combine_first(s2)
df1 = DataFrame(
'A':[1,np.nan,2,np.nan],
'B':[3,np.nan,4,np.nan],
'C':[5,np.nan,6,np.nan],
df2 = DataFrame(
'A':[np.nan,2,np.nan,1],
'X':[np.nan,3,np.nan,4],
'Y':[np.nan,5,np.nan,6],
df1.combine_first(df2)
4-6 通过apply进行数据预处理
批量改变dataframe中的一列或者一行
df = DataFrame({
'time':[int(time.time()),int(time.time()),int(time.time()),int(time.time()),int(time.time())],
'data':['Symbol:APPL Seqno:0 Price:1623','Symbol:APPL Seqno:0 Price:1623','Symbol:APPL Seqno:0 Price:1623','Symbol:APPL Seqno:0 Price:1623','Symbol:APPL Seqno:1 Price:1623',]
df.to_csv('apply_demo.csv')
pd.read_csv('apply_demo.csv')
df['A']=Series(['a']*5)
df['A'] = df['A'].apply(str.upper)
def foo(line):
r = [i.split(':')[1] for i in line.strip().split(' ')]
return Series([i.split(':')[1] for i in df['data'][0].strip().split(' ')])
df_tmp = df['data'].apply(foo)
df_tmp = df_tmp.rename(columns={0:'Symbol',1:'Seqno',2:'Price'})
df_new = df.combine_first(df_tmp)
del df_new['data']
del df_new['a']
df_new.to_csv('homework/demo_duplicate.cvs')
4-7 通过去重进行数据清洗
df = pd.read_csv('demo_duplicate.csv')
del df['Unnamed: 0']
df['Seqno'].duplicated()
df['Seqno'].unique()
df.size
len(df)
df['Seqno'].unique()
df['Seqno'].drop_duplicates()
df.drop_duplicates(['Seqno'], keep='last')
4-8 时间序列操作基础
from datetime import datetime
s = datetime(2017,9,14)
date_list = [
datetime(2016,9,1),
datetime(2016,9,2),
datetime(2018,8,8),
datetime(2017,11,18),
datetime(2018,11,23)
s1 = Series(np.random.rand(5),index=date_list)
s1[0]
s1[datetime(2016,9,1)]
s1['2016-9-1']
2016-09-01 0.392439
dtype: float64
s1['20160901']
2016-09-01 0.392439
dtype: float64
s1['2016-9']
date_list_new = pd.date_range('2000-01-01',periods=100,freq='5H')
s2 = DataFrame(np.random.randn(100),index=date_list_new)
4-9 时间序列数据的采样和画图
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
t_range = pd.date_range('2016-01-01','2016-12-31')
s1 = Series(np.random.randn(len(t_range)), index=t_range)
s1['2016-01'].mean()
s1_month = s1.resample('M').mean()
s1.resample('H').ffill()
下面展示画图
t_range = pd.date_range('2016-01-01','2016-12-31', freq='H')
stock_df = DataFrame(index=t_range)
stock_df['BABA']=np.random.randint(80,160, size=len(t_range))
stock_df['TENCENT'] = np.random.randint(30,50, size=len(t_range))
stock_df.plot()
析入门与实践
我们看到这个图片上面由于数据较多,折线较密集
weekly_df= DataFrame()
weekly_df['BABA'] = stock_df['BABA'].resample('W').mean()
weekly_df['TENCENT'] = stock_df['TENCENT'].resample('W').mean()
weekly_df.plot()
4-10 数据分箱技术Binning
score_list = np.random.randint(25,100,size=20)
bins=[0,59,70,80,100]
score_cat=pd.cut(score_list,bins)
pd.value_counts(score_cat)
(0, 59] 7
(70, 80] 6
(80, 100] 4
(59, 70] 3
dtype: int64
df = DataFrame()
df['socre'] = score_list
df['student'] = [pd.util.testing.rands(3) for i in range(20)]
df['Categories ']=pd.cut(df['socre'],bins,labels=['Low','OK','Good','Great'])
df.sort_values('socre',ascending=False)
4-11 数据分组技术GroupBy
df1 = pd.read_csv('city_wearther.csv')
g = df1.groupby(df1['city'])
g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fcf906c5910>
g.groups
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
'GZ': Int64Index([14, 15, 16, 17, 18, 19], dtype='int64'),
'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')}
df_bj= g.get_group('BJ')
df_bj.mean()
temperature 10.000000
wind 2.833333
dtype: float64
df1.mean()
temperature 7.10
wind 3.35
dtype: float64
list(g)
dict(list(g))
for k,v in dict(list(g)).items():
print(k,v)
4-12 数据聚合技术Aggregation
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('city_wearther.csv')
g = df.groupby('city')
g.describe()
g.get_group('BJ')
g.agg('max')
def foo(attr):
return attr.max()-attr.min()
g.agg(foo)
temperature wind
BJ 22 3
GZ 35 4
SH 30 3
g_new = df.groupby(['city','wind'])
g_new.groups
g.groups
g_new.get_group(('BJ',2))
for (name_1,name_2), group in g_new:
print((name_1,name_2), group)
4-13 透视表
为了更好地展示数据,临时的将表结构进行变更
df.pivot_table
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
df = pd.read_excel('sales-funnel.xlsx')
pd.pivot_table(df, index=['Manager','Rep'], values=['Price'])
pd.pivot_table(df, index=['Manager','Rep'], values=['Price','Quantity'],aggfunc='sum',columns=['Product'],fill_value=0)
4-14 分组和透视功能实战
projects.fivethirtyeight.com/flights/
第5章 绘图和可视化之Matplotlib
5-1 Matplotlib介绍
为什么用Python画图?
GUI太复杂
Excel太头疼
Python简单,免费(sorry Matlab)
什么是matplotlib?
一个Python包
用于2D绘图
非常强大和流行
有很多扩展
hello world in matplotlib
import matplotlib .pyplot as plt
import numpy as np
%matplotlib inline
x = np.linspace(0,2*np.pi,100)
y = np.sin(x)
plt.plot(x,y)
Matplotlib Architecture
Matplotlib的架构
backend:主要处理把图显示到哪里和画到哪里
Artist:图像显示成什么样
scripting:pyplot ,python语法和API
5-2 matplotlib简单绘图之plot
import numpy as np
import matplotlib.pyplot as plt
a = [1,2,3]
b=[4,5,6]
plt.plot(a,b)
plt.show()
%matplotlib inline
%timeit np.arange(10)
plt.plot(a,b,'r--')
c=[10,8,6]
d=[1,8,3]
plt.plot(a,b,c,d)
plt.plot(a,b, 'r--', c,d, 'g+')
t = np.arange(0.0,2.0,0.1)
s = np.sin(t*np.pi)
plt.plot(t,s,'r--',label='aaaa')
plt.plot( t*2,s,'b--',label='bbbb')
plt.xlabel('this is x')
plt.ylabel('this is y')
plt.title('this is title')
plt.legend()
5-3 matplotlib简单绘图之subplot
import numpy as np
import matplotlib.pyplot as plt
x = np.linspace(0.0, 5.0)
y1 = np.sin(np.pi*x)
y2 = np.sin(np.pi*x*2)
plt.plot(x,y1,'b--',label='sin(pi*x)')
plt.ylabel('y1 value')
plt.plot(x,y2,'r--', label='sin(pi*2x)')
plt.ylabel('y2 value')
plt.xlabel('x value')
plt.title('thsi is x-y value')
plt.legend()
plt.subplot(2,1,1)
plt.plot(x,y1,'b--')
plt.ylabel('y1')
plt.subplot(2,1,2)
plt.plot(x,y2,'r--')
plt.ylabel('y2')
plt.xlabel('x')
还有下面这种方式画子图,两行两列,第几个
另外一种画子图的方式
figure,ax = plt.subplots(2,2)
ax[0][0].plot(x,y1)
ax[0][1].plot(x,y2)
5-4 Pandas绘图之Series
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
s1 = Series(np.random.randn(1000)).cumsum()
s2 = Series(np.random.randn(1000)).cumsum()
s1.plot(kind='line',grid=True,label='S1',title='this is Series',style='r--')
s2.plot(label='S2')
plt.legend()
fig,ax = plt.subplots(2,1)
ax[0].plot(s1)
ax[1].plot(s2)
fig,ax = plt.subplots(2,1)
s1.plot(ax=ax[0],label='S1',style='r--')
s2.plot(ax=ax[1],label='S2',style='b')
fig,ax = plt.subplots(2,1)
s1[0:10].plot(ax=ax[0],label='S1',kind='bar')
s2.plot(ax=ax[1],label='S2',kind='line')
5-5 Pandas绘图之DataFrame
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
df = DataFrame(
np.random.randint(1,10,40).reshape(10,4),
columns=['A','B','C','D']
df.plot(kind='barh')
df.plot(kind='bar',stacked=True)
df.plot(kind='area')
df.iloc[5].plot()
for i in df.index:
df.iloc[i].plot(label=str(i))
plt.legend()
for i in df.columns:
df[i].plot(label=str(i))
plt.legend()
df.T.plot()
5-6 直方图和密度图
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
s = Series(np.random.randn(1000))
plt.hist(s,rwidth=0.9)
plt.hist(s,rwidth=0.9,bins=20,color='r')
s.plot(kind='kde')
39.png)
第6章 绘图和可视化之Seaborn
Seaborn是对Matplotlib的进一步封装,其强大的调色功能和内置的多种多样的绘图模式,使之成为当下最流行的数据科学绘图工具。本章将介绍Seaborn的基本使用,以及和matplotlib的功能对比。
6-1 seaborn介绍
Seaborn- Powerful Matplotlib Extension :强大的Matplotlib扩展
statistical data visualization 统计数据可视化
Seaborn的优势在哪里?
简单,快捷的可视化工具包
鸢尾花长度散点图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
iris = pd.read_csv('iris.csv')
iris.Name.unique()
color_map = dict(zip(iris.Name.unique(), ['blue','green','red']))
matplotlib画图
for species,group in iris.groupby('Name'):
plt.scatter(
group['PetalLength'],group['SepalLength'],
color=color_map[species],
alpha=0.3,edgecolor=None,
label=species
plt.legend(frameon=True,title='Name')
plt.xlabel('petalLength')
plt.ylabel('sepalLength')
使用seaborn画图
sns.lmplot('PetalLength','SepalLength',iris,hue='Name',fit_reg = False)
6-2 seaborn实现直方图和密度图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
%matplotlib inline
import seaborn as sns
s1 = Series(np.random.randn(1000))
plt.hist(s1)
s1.plot(kind='kde')
sns.distplot(s1,hist=True,kde=True,rug=True)
seaborn直接画图
sns.kdeplot(s1,shade=True,color='green')
plt.hist(s1)
sns.rugplot(s1)
6-3 seaborn实现柱状图和热力图
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
获取一些试验的数据,采用下面的方法实现
df = sns.load_dataset('flights')
Michael Waskom的github,上面有很多的csv文件实例
github.com/mwaskom/sea…
df = df.pivot(index='month',columns='year', values='passengers')
生成热力图--sns.heatmap
sns.heatmap(df,annot=True,fmt='d')
df.plot(kind='area')
s = df.sum()
sns.barplot(x=s.index,y=s.values)
s.plot(kind='bar')
6-4 seaborn图形显示效果的设置
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series,DataFrame
import seaborn as sns
%matplotlib inline
x = np.linspace(0,14,100)
y1 = np.sin(x)
y2 = np.sin(x+1)*1.25
def sinplot():
plt.plot(x,y1,color='red',)
plt.plot(x,y2,label='blue')
sinplot()
style=['darkgrid','dark','white','whitegrid','tricks']
sns.set_style(style[0],{'grid.color': 'blue'})
sinplot()
sns.axes_style()
sns.set()
plotting_context()and set_context()
context = ['paper','notebook','poster']
sns.set_context(context[1],rc={'grid.linewidth':3.0})
sns.plotting_context()
sns.set()
6-5 seaborn强大的调色功能
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
def sinplot():
x = np.linspace(0,14,100)
plt.figure(figsize=(8,6))
for i in range(4):
plt.plot(x, np.sin(x+i)*(i+0.75),label='sin(x+{})*({}+0.75)'.format(i,i))
plt.legend()
import seaborn as sns
sinplot()
sns.color_palette()
sns.palplot([(0.2980392156862745, 0.4470588235294118, 0.6901960784313725)])
pal_style=['deep','muted','pastel','bright','dark','colorblind']
sns.palplot(sns.color_palette(pal_style[4]))
sns.set_palette(sns.color_palette(pal_style[4]))
with sns.color_palette('dark'):
sinplot()
pal2 = sns.color_palette('hls',8)
sns.palplot(pal2)
seaborn官网消息讲述色板博文
seaborn.pydata.org/tutorial/co…
第7章 数据分析项目实战
通过前六章的学习,我们基本上掌握了数据分析领域里主要工具的使用,本章将通过一个股票市场的分析实战项目,和大家一起用学过的知识去分析数据,进而得到有用的信息。
7-1 实战准备
数据分析数据源
https://github.com/chenjiandongx/bili-spider
https://www.zhihu.com/question/19969760
https://www.kaggle.com/
google public data
https://registry.opendata.aws/
7-2 股票市场分析实战之数据获取
雅虎上面获取股票数据
finance.yahoo.com/
使用工具去抓取股票的信息
pandas-datareader
安装pandas-datareader
/root/anaconda3/bin
pip install pandas_datareader
https://pandas-datareader.readthedocs.io/en/latest/
阿里巴巴股票数据的获取
import pandas_datareader as pdr
alibaba = pdr.get_data_yahoo('BABA')
alibaba.head()
alibaba.shape
alibaba.tail()
alibaba.describe()
alibaba.info
7-3 股票市场分析实战之历史趋势分析
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime
从雅虎上获取阿里巴巴和亚马逊的股票数据
start = datetime(2015,9,20)
end = datetime(2017,11,30)
alibaba = pdr.get_data_yahoo('BABA',start=start,end=end)
amazon = pdr.get_data_yahoo('AMZN',start=start,end=end)
df直接画图
alibaba['Adj Close'].plot(legend=True)
alibaba['Volume'].plot(legend=True)
alibaba['Adj Close'].plot()
amazon['Adj Close'].plot()
alibaba['high-low'] = alibaba['High']-alibaba['Low']
alibaba['high-low'].plot()
alibaba['daily-return'] = alibaba['Adj Close'].pct_change()
alibaba['daily-return'].plot()
alibaba['daily-return'].plot(figsize=(10,4),linestyle='--',marker='o')
alibaba['daily-return'].plot(kind='hist')
seaborn来画图
sns.distplot(alibaba['daily-return'].dropna(),bins=100,color='purple')
7-4 股票市场分析实战之风险分析
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import pandas_datareader as pdr
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime
start = datetime(2018,1,1)
company=['AAPL','GOOG','MSFT','AMZN','FB']
top_tech_df = pdr.get_data_yahoo(company,start=start)['Adj Close']
top_tech_dr = top_tech_df.pct_change()
top_tech_df.plot()
top_tech_df[['AAPL','FB','MSFT']].plot()
sns.jointplot('AMZN','GOOG',top_tech_dr,kind='scatter')
sns.pairplot(top_tech_dr.dropna())
vips.pct_change().quantile(0.2)
写在后面:
机器学习和数据分析我们在工作和日常中都可以使用到的
在后面的学习过程中,我们可以去kaggle中找到相关案例,然后练习
www.kaggle.com/kernels