Pandas
学会Pandas
Pandas最初被作为金融数据分析工具而开发出来,在金融领域被广泛使用。Pandas纳入了大量库和一些标准的数据模型,提供了高效地操作大型数据集所需的工具、函数和方法。
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA'
AND date BETWEEN '2017-01-06' AND '2017-02-10'
ORDER BY date;
# 导入库
import pandas as pd
import numpy as np
#Pandas数据结构
主要数据结构:Series和DataFrame,Series是一种类似于一维数组的对象,它由一组数据以及一组与之相关的数据标签(即索引)组成。DataFrame是一个表格型 的数据结构,可以简单地理解为是由多个具有相同的索引的Series组成。DataFrame是金融数据分析中特别常用的数据结构,也是我们平台最常遇到的数据结构。
# 创建Series,以日期为索引
dates = pd.date_range('2/6/2017', periods=5, freq='D')
s = pd.Series([1,3,5,np.nan,6,],index=dates)
s
2017-02-06 1.0
2017-02-07 3.0
2017-02-08 5.0
2017-02-09 NaN
2017-02-10 6.0
Freq: D, dtype: float64
# 创建DataFrame,以日期为索引
df = pd.DataFrame(np.random.randn(5,4), index=dates, columns=list('ABCD'))
df
A | B | C | D | |
---|---|---|---|---|
2017-02-06 | -1.738581 | -1.750260 | -0.532282 | -0.409738 |
2017-02-07 | -0.871625 | 0.188268 | -0.689427 | 1.128257 |
2017-02-08 | 1.504632 | -0.229489 | -2.775763 | 1.867307 |
2017-02-09 | -1.753426 | 1.252952 | 0.053105 | 0.870436 |
2017-02-10 | -1.162485 | -0.376172 | 1.548791 | -0.928139 |
sql=f""
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA'
AND date BETWEEN '2017-01-06' AND '2017-01-16'
ORDER BY date;""
# 选择某一列数据,它会返回一个Series,等同于df.close:
df['close']
df
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-06 | 000005.SZA | 64.502495 | 66.077988 | 64.224464 | 65.151230 |
2017-01-09 | 000005.SZA | 64.502495 | 65.892639 | 64.409821 | 65.521935 |
2017-01-10 | 000005.SZA | 65.521935 | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 64.965874 | 63.575737 | 63.575737 |
2017-01-12 | 000005.SZA | 63.205032 | 63.761089 | 62.741653 | 62.927006 |
2017-01-13 | 000005.SZA | 62.927006 | 63.205032 | 61.907570 | 61.907570 |
2017-01-16 | 000005.SZA | 62.185596 | 62.278271 | 56.810387 | 59.683346 |
#查看数据
# 显示DataFrame前几行(默认是5)的数据
df.head()
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-06 | 000005.SZA | 64.502495 | 66.077988 | 64.224464 | 65.151230 |
2017-01-09 | 000005.SZA | 64.502495 | 65.892639 | 64.409821 | 65.521935 |
2017-01-10 | 000005.SZA | 65.521935 | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 64.965874 | 63.575737 | 63.575737 |
2017-01-12 | 000005.SZA | 63.205032 | 63.761089 | 62.741653 | 62.927006 |
# 显示DataFrame后行的数据
df.tail(2)
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-13 | 000005.SZA | 62.927006 | 63.205032 | 61.907570 | 61.907570 |
2017-01-16 | 000005.SZA | 62.185596 | 62.278271 | 56.810387 | 59.683346 |
#显示索引、列名以及底层的numpy数据
# 数据索引
df.index
# output:
DatetimeIndex(['2017-01-06', '2017-01-09', '2017-01-10', '2017-01-11',
'2017-01-12', '2017-01-13', '2017-01-16'],
dtype='datetime64[ns]', name='date', freq=None)
# 数据列名
df.columns
# output:
Index(['instrument', 'open', 'high', 'low', 'close'], dtype='object')
# 底层numpy数据
df.values
# output:
array([['000005.SZA', 64.50249481201172, 66.07798767089844,
64.2244644165039, 65.15122985839844],
['000005.SZA', 64.50249481201172, 65.89263916015625,
64.40982055664062, 65.52193450927734],
['000005.SZA', 65.52193450927734, 65.61460876464844,
64.96587371826172, 64.96587371826172],
['000005.SZA', 64.96587371826172, 64.96587371826172,
63.57573699951172, 63.57573699951172],
['000005.SZA', 63.20503234863281, 63.76108932495117,
62.74165344238281, 62.927005767822266],
['000005.SZA', 62.927005767822266, 63.20503234863281,
61.907569885253906, 61.907569885253906],
['000005.SZA', 62.18559646606445, 62.27827072143555,
56.810386657714844, 59.683345794677734]], dtype=object)
# 数据类型
df.dtypes
# output:
instrument object
open float32
high float32
low float32
close float32
dtype: object
#转置、排序、偏移
# 转置
df.T
date | 2017-01-06 00: 00: 00 | 2017-01-09 00: 00: 00 | 2017-01-10 00: 00: 00 | 2017-01-11 00: 00: 00 | 2017-01-12 00: 00: 00 | 2017-01-13 00: 00: 00 | 2017-01-16 00: 00: 00 |
---|---|---|---|---|---|---|---|
instrument | 000005.SZA | 000005.SZA | 000005.SZA | 000005.SZA | 000005.SZA | 000005.SZA | 000005.SZA |
open | 64.5025 | 64.5025 | 65.5219 | 64.9659 | 63.205 | 62.927 | 62.1856 |
high | 66.078 | 65.8926 | 65.6146 | 64.9659 | 63.7611 | 63.205 | 62.2783 |
low | 64.2245 | 64.4098 | 64.9659 | 63.5757 | 62.7417 | 61.9076 | 56.8104 |
close | 65.1512 | 65.5219 | 64.9659 | 63.5757 | 62.927 | 61.9076 | 59.6833 |
# 按轴进行排序
df.sort_index(axis=1, ascending=False)
date | open | low | instrument | high | close |
---|---|---|---|---|---|
2017-01-06 | 64.502495 | 64.224464 | 000005.SZA | 66.077988 | 65.151230 |
2017-01-09 | 64.502495 | 64.409821 | 000005.SZA | 65.892639 | 65.521935 |
2017-01-10 | 65.521935 | 64.965874 | 000005.SZA | 65.614609 | 64.965874 |
2017-01-11 | 64.965874 | 63.575737 | 000005.SZA | 64.965874 | 63.575737 |
2017-01-12 | 63.205032 | 62.741653 | 000005.SZA | 63.761089 | 62.927006 |
2017-01-13 | 62.927006 | 61.907570 | 000005.SZA | 63.205032 | 61.907570 |
2017-01-16 | 62.185596 | 56.810387 | 000005.SZA | 62.278271 | 59.683346 |
# 按值进行排序
df.sort_values(by='close',ascending=False) # ascending默认升序,ascending=False降序
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-09 | 000005.SZA | 64.502495 | 65.892639 | 64.409821 | 65.521935 |
2017-01-06 | 000005.SZA | 64.502495 | 66.077988 | 64.224464 | 65.151230 |
2017-01-10 | 000005.SZA | 65.521935 | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 64.965874 | 63.575737 | 63.575737 |
2017-01-12 | 000005.SZA | 63.205032 | 63.761089 | 62.741653 | 62.927006 |
2017-01-13 | 000005.SZA | 62.927006 | 63.205032 | 61.907570 | 61.907570 |
2017-01-16 | 000005.SZA | 62.185596 | 62.278271 | 56.810387 | 59.683346 |
#shift数据偏移(可理解为回溯)
# 利用pct_change计算每日收益
sql=f""SELECT date, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000001.SZA'
AND date BETWEEN '2010-01-01' AND '2016-12-31'
ORDER BY date;""
rets.head()
date
2010-01-04 NaN
2010-01-05 -0.017292
2010-01-06 -0.017167
2010-01-07 -0.010917
2010-01-08 -0.002208
Name: close, dtype: float32
# 计算每日收益,等价于pct_change函数
daily_returns = df / df.shift(1) -1 # 正数表示向前偏移,负数表示向后偏移
daily_returns.head()
date
2010-01-04 NaN
2010-01-05 -0.017292
2010-01-06 -0.017167
2010-01-07 -0.010917
2010-01-08 -0.002208
Name: close, dtype: float32
#数据选择
选取
sql=""WITH df1 AS(
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA'
AND date BETWEEN '2017-01-06' AND '2017-02-10'
ORDER BY date;)""
df = df.set_index('date')
# 选择某一列数据,它会返回一个Series,等同于df.close:
df['close']
date
2017-01-06 65.151230
2017-01-09 65.521935
2017-01-10 64.965874
2017-01-11 63.575737
2017-01-12 62.927006
2017-01-13 61.907570
2017-01-16 59.683346
2017-01-17 60.054047
2017-01-18 60.702782
2017-01-19 59.683346
2017-01-20 60.239399
2017-01-23 60.517429
2017-01-24 60.332077
2017-01-25 60.424751
2017-01-26 60.795456
2017-02-03 60.517429
2017-02-06 60.795456
2017-02-07 61.166161
2017-02-08 61.073483
2017-02-09 61.444187
2017-02-10 61.536865
Name: close, dtype: float32
# 获取2017-02-06到2017-02-09的所有数据
df.loc['20170206':'20170209']
date instrument open high low close
2017-02-06 000005.SZA 60.517429 60.795456 60.146725 60.795456
2017-02-07 000005.SZA 60.795456 61.629539 60.424751 61.166161
2017-02-08 000005.SZA 60.980808 61.166161 60.424751 61.073483
2017-02-09 000005.SZA 61.073483 61.536865 60.888130 61.444187
# 获取2017-02-06的所有数据
df.loc['2017-02-06':'2017-02-06']
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 |
# 获取2017-02-06的开盘价和收盘价数据
df.loc['2017-02-06':'2017-02-06'][['open','close']]
date | open | close |
---|---|---|
2017-02-06 | 60.517429 | 60.795456 |
# 获取2017-02-06和2017-02-09间断两天的开盘价和收盘价数据
df.loc['2017-02-06':'2017-02-09'][['open','close']]
date | open | close |
---|---|---|
2017-02-06 | 60.517429 | 60.795456 |
2017-02-07 | 60.795456 | 61.166161 |
2017-02-08 | 60.980808 | 61.073483 |
2017-02-09 | 61.073483 | 61.444187 |
# 获取2017-02-06的开盘价,返回DataFrame
df.loc['2017-02-06':'2017-02-06'][['open']]
date open
2017-02-06 60.517429
# 获取2017-02-06的收盘价,返回标量
df.loc['2017-02-06','close']
60.517429
#布尔索引
# 获取最高价大于64的交易日数据
df[df.high > 64]
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-06 | 000005.SZA | 64.502495 | 66.077988 | 64.224464 | 65.151230 |
2017-01-09 | 000005.SZA | 64.502495 | 65.892639 | 64.409821 | 65.521935 |
2017-01-10 | 000005.SZA | 65.521935 | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 64.965874 | 63.575737 | 63.575737 |
2017-01-18 | 000005.SZA | 60.888130 | 64.687851 | 60.517429 | 60.702782 |
#where 索引
# 选出大于60的元素
df[df > 60]
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-06 | 000005.SZA | 64.502495 | 66.077988 | 64.224464 | 65.151230 |
2017-01-09 | 000005.SZA | 64.502495 | 65.892639 | 64.409821 | 65.521935 |
2017-01-10 | 000005.SZA | 65.521935 | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 64.965874 | 63.575737 | 63.575737 |
2017-01-12 | 000005.SZA | 63.205032 | 63.761089 | 62.741653 | 62.927006 |
2017-01-13 | 000005.SZA | 62.927006 | 63.205032 | 61.907570 | 61.907570 |
2017-01-16 | 000005.SZA | 62.185596 | 62.278271 | NaN | NaN |
2017-01-17 | 000005.SZA | NaN | 60.424751 | NaN | 60.054047 |
2017-01-18 | 000005.SZA | 60.888130 | 64.687851 | 60.517429 | 60.702782 |
2017-01-19 | 000005.SZA | NaN | 60.980808 | NaN | NaN |
2017-01-20 | 000005.SZA | 60.239399 | 60.610104 | NaN | 60.239399 |
2017-01-23 | 000005.SZA | 60.332077 | 60.888130 | 60.146725 | 60.517429 |
2017-01-24 | 000005.SZA | 60.610104 | 60.610104 | 60.239399 | 60.332077 |
2017-01-25 | 000005.SZA | 60.424751 | 60.517429 | NaN | 60.424751 |
2017-01-26 | 000005.SZA | 60.517429 | 60.795456 | 60.424751 | 60.795456 |
2017-02-03 | 000005.SZA | 60.888130 | 60.888130 | 60.239399 | 60.517429 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 |
#丢弃数据
# 丢弃行
df.drop([pd.Timestamp('2017-01-06')]) # 丢弃行
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2017-01-09 | 000005.SZA | 64.502495 | 65.892639 | 64.409821 | 65.521935 |
2017-01-10 | 000005.SZA | 65.521935 | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 64.965874 | 63.575737 | 63.575737 |
2017-01-12 | 000005.SZA | 63.205032 | 63.761089 | 62.741653 | 62.927006 |
2017-01-13 | 000005.SZA | 62.927006 | 63.205032 | 61.907570 | 61.907570 |
2017-01-16 | 000005.SZA | 62.185596 | 62.278271 | 56.810387 | 59.683346 |
2017-01-17 | 000005.SZA | 59.497993 | 60.424751 | 58.385880 | 60.054047 |
2017-01-18 | 000005.SZA | 60.888130 | 64.687851 | 60.517429 | 60.702782 |
2017-01-19 | 000005.SZA | 59.776020 | 60.980808 | 59.497993 | 59.683346 |
2017-01-20 | 000005.SZA | 60.239399 | 60.610104 | 59.776020 | 60.239399 |
2017-01-23 | 000005.SZA | 60.332077 | 60.888130 | 60.146725 | 60.517429 |
2017-01-24 | 000005.SZA | 60.610104 | 60.610104 | 60.239399 | 60.332077 |
2017-01-25 | 000005.SZA | 60.424751 | 60.517429 | 59.961372 | 60.424751 |
2017-01-26 | 000005.SZA | 60.517429 | 60.795456 | 60.424751 | 60.795456 |
2017-02-03 | 000005.SZA | 60.888130 | 60.888130 | 60.239399 | 60.517429 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 |
# 丢弃列 ,del df['open'],原地修改
df.drop('open',axis=1) # 非原地修改
date | instrument | high | low | close |
---|---|---|---|---|
2017-01-06 | 000005.SZA | 66.077988 | 64.224464 | 65.151230 |
2017-01-09 | 000005.SZA | 65.892639 | 64.409821 | 65.521935 |
2017-01-10 | 000005.SZA | 65.614609 | 64.965874 | 64.965874 |
2017-01-11 | 000005.SZA | 64.965874 | 63.575737 | 63.575737 |
2017-01-12 | 000005.SZA | 63.761089 | 62.741653 | 62.927006 |
2017-01-13 | 000005.SZA | 63.205032 | 61.907570 | 61.907570 |
2017-01-16 | 000005.SZA | 62.278271 | 56.810387 | 59.683346 |
2017-01-17 | 000005.SZA | 60.424751 | 58.385880 | 60.054047 |
2017-01-18 | 000005.SZA | 64.687851 | 60.517429 | 60.702782 |
2017-01-19 | 000005.SZA | 60.980808 | 59.497993 | 59.683346 |
2017-01-20 | 000005.SZA | 60.610104 | 59.776020 | 60.239399 |
2017-01-23 | 000005.SZA | 60.888130 | 60.146725 | 60.517429 |
2017-01-24 | 000005.SZA | 60.610104 | 60.239399 | 60.332077 |
2017-01-25 | 000005.SZA | 60.517429 | 59.961372 | 60.424751 |
2017-01-26 | 000005.SZA | 60.795456 | 60.424751 | 60.795456 |
2017-02-03 | 000005.SZA | 60.888130 | 60.239399 | 60.517429 |
2017-02-06 | 000005.SZA | 60.795456 | 60.146725 | 60.795456 |
2017-02-07 | 000005.SZA | 61.629539 | 60.424751 | 61.166161 |
2017-02-08 | 000005.SZA | 61.166161 | 60.424751 | 61.073483 |
2017-02-09 | 000005.SZA | 61.536865 | 60.888130 | 61.444187 |
2017-02-10 | 000005.SZA | 61.629539 | 61.258835 | 61.536865 |
#对齐、合并、赋值
sql=""WITH df1 AS (
SELECT
date,
open,
high,
low,
close
FROM
bar1d_CN_STOCK_A
WHERE
instrument = '000005.SZA'
AND date BETWEEN '2017-02-01' AND '2017-02-10'
) , df2 AS (
SELECT
date,
market_cap,
pe_ttm
FROM
market_value_CN_STOCK_A
WHERE
instrument = '000005.SZA'
AND date BETWEEN '2017-01-01' AND '2017-02-10'
)""
# 取索引的交集进行合并
df1.merge(df2,left_index=True,right_index=True,how='inner')
date | instrument_x | open | high | low | close | instrument_y | market_cap | pe_ttm |
---|---|---|---|---|---|---|---|---|
2017-02-03 | 000005.SZA | 60.888130 | 60.888130 | 60.239399 | 60.517429 | 000005.SZA | 6.912246e+09 | 63.786068 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 | 000005.SZA | 6.944002e+09 | 64.079109 |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 | 000005.SZA | 6.986343e+09 | 64.469841 |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 | 000005.SZA | 6.975758e+09 | 64.372154 |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 | 000005.SZA | 7.018099e+09 | 64.762886 |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 | 000005.SZA | 7.028685e+09 | 64.860565 |
# 取索引的并集进行合并
df1.merge(df2,left_index=True,right_index=True,how='outer')
date | instrument_x | open | high | low | close | instrument_y | market_cap | pe_ttm |
---|---|---|---|---|---|---|---|---|
2017-01-03 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.229807e+09 | 66.716515 |
2017-01-04 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.420343e+09 | 68.474785 |
2017-01-05 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.378002e+09 | 68.084053 |
2017-01-06 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.441514e+09 | 68.670143 |
2017-01-09 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.483855e+09 | 69.060875 |
2017-01-10 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.420343e+09 | 68.474785 |
2017-01-11 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.261563e+09 | 67.009560 |
2017-01-12 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.187465e+09 | 66.325790 |
2017-01-13 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 7.071026e+09 | 65.251289 |
2017-01-16 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.816977e+09 | 62.906933 |
2017-01-17 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.859319e+09 | 63.297661 |
2017-01-18 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.933416e+09 | 63.981430 |
2017-01-19 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.816977e+09 | 62.906933 |
2017-01-20 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.880489e+09 | 63.493023 |
2017-01-23 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.912246e+09 | 63.786068 |
2017-01-24 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.891075e+09 | 63.590702 |
2017-01-25 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.901660e+09 | 63.688385 |
2017-01-26 | NaN | NaN | NaN | NaN | NaN | 000005.SZA | 6.944002e+09 | 64.079109 |
2017-02-03 | 000005.SZA | 60.888130 | 60.888130 | 60.239399 | 60.517429 | 000005.SZA | 6.912246e+09 | 63.786068 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 | 000005.SZA | 6.944002e+09 | 64.079109 |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 | 000005.SZA | 6.986343e+09 | 64.469841 |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 | 000005.SZA | 6.975758e+09 | 64.372154 |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 | 000005.SZA | 7.018099e+09 | 64.762886 |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 | 000005.SZA | 7.028685e+09 | 64.860565 |
# 通过标签赋值
df1.loc['2017-02-03','market_cap'] = 0
date | instrument | open | high | low | close | market_cap |
---|---|---|---|---|---|---|
2017-02-03 | 000005.SZA | 60.888130 | 60.888130 | 60.239399 | 60.517429 | 0.0 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 | NaN |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 | NaN |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 | NaN |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 | NaN |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 | NaN |
# 通过位置赋值
df1.iat[0,0] ='600519.SHA'
df1
date | instrument | open | high | low | close | market_cap |
---|---|---|---|---|---|---|
2017-02-03 | 600519.SHA | 60.888130 | 60.888130 | 60.239399 | 60.517429 | 0.0 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 | NaN |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 | NaN |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 | NaN |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 | NaN |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 | NaN |
# 通过where操作来赋值
df1[df1['market_cap'] == 0] = 111
df1.head(4)
date | instrument | open | high | low | close | market_cap |
---|---|---|---|---|---|---|
2017-02-03 | 111 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.0 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 | NaN |
2017-02-07 | 000005.SZA | 60.795456 | 61.629539 | 60.424751 | 61.166161 | NaN |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 | NaN |
#缺失值处理
在pandas中,用np.nan来代表缺失值,这些值默认不会参与运算。
# 通过set_index设置某一列为索引
sql=""WITH df2 AS(
SELECT date, open, high, low, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000005.SZA'
AND date BETWEEN '2016-11-15' AND '2016-12-01'
ORDER BY date;)""
df2
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2016-11-17 | 000005.SZA | 69.970383 | 70.155731 | 69.136299 | 69.970383 |
2016-11-18 | 000005.SZA | 69.970383 | 70.433762 | 69.321648 | 70.063057 |
2016-11-21 | 000005.SZA | 69.970383 | 70.711784 | 69.692352 | 69.877701 |
2016-11-22 | 000005.SZA | 69.970383 | 70.619110 | 69.692352 | 70.341087 |
2016-11-23 | 000005.SZA | 70.433762 | 70.897141 | 69.785027 | 70.526436 |
2016-11-24 | 000005.SZA | 70.155731 | 70.619110 | 69.507004 | 69.970383 |
2016-11-25 | 000005.SZA | 69.970383 | 70.433762 | 68.672913 | 70.063057 |
2016-11-28 | 000005.SZA | 70.433762 | 71.638550 | 69.785027 | 70.711784 |
2016-11-29 | 000005.SZA | 70.433762 | 71.082489 | 69.228973 | 69.321648 |
2016-11-30 | 000005.SZA | 69.043617 | 69.599678 | 68.580238 | 68.765594 |
2016-12-01 | 000005.SZA | 68.950943 | 69.321648 | 68.672913 | 69.043617 |
# 通过resset_index来将DataFrame的索引转化为列
df2.reset_index()
date | instrument | open | high | low | close | |
---|---|---|---|---|---|---|
0 | 2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
1 | 2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2 | 2016-11-17 | 000005.SZA | 69.970383 | 70.155731 | 69.136299 | 69.970383 |
3 | 2016-11-18 | 000005.SZA | 69.970383 | 70.433762 | 69.321648 | 70.063057 |
4 | 2016-11-21 | 000005.SZA | 69.970383 | 70.711784 | 69.692352 | 69.877701 |
5 | 2016-11-22 | 000005.SZA | 69.970383 | 70.619110 | 69.692352 | 70.341087 |
6 | 2016-11-23 | 000005.SZA | 70.433762 | 70.897141 | 69.785027 | 70.526436 |
7 | 2016-11-24 | 000005.SZA | 70.155731 | 70.619110 | 69.507004 | 69.970383 |
8 | 2016-11-25 | 000005.SZA | 69.970383 | 70.433762 | 68.672913 | 70.063057 |
9 | 2016-11-28 | 000005.SZA | 70.433762 | 71.638550 | 69.785027 | 70.711784 |
10 | 2016-11-29 | 000005.SZA | 70.433762 | 71.082489 | 69.228973 | 69.321648 |
11 | 2016-11-30 | 000005.SZA | 69.043617 | 69.599678 | 68.580238 | 68.765594 |
12 | 2016-12-01 | 000005.SZA | 68.950943 | 69.321648 | 68.672913 | 69.043617 |
# 将2016-11-22以后的数据用缺失值代替
df2.loc['2016-11-22':,:] = np.nan
df2
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2016-11-17 | 000005.SZA | 69.970383 | 70.155731 | 69.136299 | 69.970383 |
2016-11-18 | 000005.SZA | 69.970383 | 70.433762 | 69.321648 | 70.063057 |
2016-11-21 | 000005.SZA | 69.970383 | 70.711784 | 69.692352 | 69.877701 |
2016-11-22 | NaN | NaN | NaN | NaN | NaN |
2016-11-23 | NaN | NaN | NaN | NaN | NaN |
2016-11-24 | NaN | NaN | NaN | NaN | NaN |
2016-11-25 | NaN | NaN | NaN | NaN | NaN |
2016-11-28 | NaN | NaN | NaN | NaN | NaN |
2016-11-29 | NaN | NaN | NaN | NaN | NaN |
2016-11-30 | NaN | NaN | NaN | NaN | NaN |
2016-12-01 | NaN | NaN | NaN | NaN | NaN |
# 剔除所有包含缺失值的行数据
df2.dropna(how='any')
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2016-11-17 | 000005.SZA | 69.970383 | 70.155731 | 69.136299 | 69.970383 |
2016-11-18 | 000005.SZA | 69.970383 | 70.433762 | 69.321648 | 70.063057 |
2016-11-21 | 000005.SZA | 69.970383 | 70.711784 | 69.692352 | 69.877701 |
# 填充缺失值
df2.fillna(value=154)
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2016-11-17 | 000005.SZA | 69.970383 | 70.155731 | 69.136299 | 69.970383 |
2016-11-18 | 000005.SZA | 69.970383 | 70.433762 | 69.321648 | 70.063057 |
2016-11-21 | 000005.SZA | 69.970383 | 70.711784 | 69.692352 | 69.877701 |
2016-11-22 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-11-23 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-11-24 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-11-25 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-11-28 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-11-29 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-11-30 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
2016-12-01 | 154 | 154.000000 | 154.000000 | 154.000000 | 154.000000 |
# 获取值是否为nan的布尔标记
pd.isnull(df2)
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2016-11-15 | False | False | False | False | False |
2016-11-16 | False | False | False | False | False |
2016-11-17 | False | False | False | False | False |
2016-11-18 | False | False | False | False | False |
2016-11-21 | False | False | False | False | False |
2016-11-22 | True | True | True | True | True |
2016-11-23 | True | True | True | True | True |
2016-11-24 | True | True | True | True | True |
2016-11-25 | True | True | True | True | True |
2016-11-28 | True | True | True | True | True |
2016-11-29 | True | True | True | True | True |
2016-11-30 | True | True | True | True | True |
2016-12-01 | True | True | True | True | True |
#运算
# describe()能对数据做一个快速统计汇总
df2.describe()
open | high | low | close | |
---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 70.211342 | 70.693253 | 69.488464 | 70.118668 |
std | 0.647406 | 0.521793 | 0.265382 | 0.250397 |
min | 69.785027 | 70.155731 | 69.136299 | 69.877701 |
25% | 69.970383 | 70.433762 | 69.321648 | 69.970383 |
50% | 69.970383 | 70.619110 | 69.507004 | 70.063057 |
75% | 69.970383 | 70.711784 | 69.692352 | 70.155731 |
max | 71.360519 | 71.545876 | 69.785027 | 70.526436 |
运算过程中,通常不包含缺失值。
# 按列计算平均值
df2.mean()
open 70.211342
high 70.693253
low 69.488464
close 70.118668
dtype: float32
# 按行计算平均值
df2.mean(1)
date
2016-11-15 70.109398
2016-11-16 70.711792
2016-11-17 69.808197
2016-11-18 69.947212
2016-11-21 70.063049
2016-11-22 NaN
2016-11-23 NaN
2016-11-24 NaN
2016-11-25 NaN
2016-11-28 NaN
2016-11-29 NaN
2016-11-30 NaN
2016-12-01 NaN
dtype: float32
# pandas会按照指定的索引对齐
dates = ['2016-11-15','2016-11-16','2016-11-17','2016-11-18','2016-11-23']
for i in range(len(dates)):
dates[i] = pd.Timestamp(dates[i])
pd.DataFrame(df2,index=dates)
instrument | open | high | low | close | |
---|---|---|---|---|---|
2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2016-11-17 | 000005.SZA | 69.970383 | 70.155731 | 69.136299 | 69.970383 |
2016-11-18 | 000005.SZA | 69.970383 | 70.433762 | 69.321648 | 70.063057 |
2016-11-23 | NaN | NaN | NaN | NaN | NaN |
Apply 函数
# 通过apply函数求每个交易日高开低收价格的平均值
df3 = df2[['open','high','low','close']]
df3.apply(np.mean,axis=1)
date
2016-11-15 70.109398
2016-11-16 70.711792
2016-11-17 69.808197
2016-11-18 69.947212
2016-11-21 70.063049
2016-11-22 NaN
2016-11-23 NaN
2016-11-24 NaN
2016-11-25 NaN
2016-11-28 NaN
2016-11-29 NaN
2016-11-30 NaN
2016-12-01 NaN
dtype: float64
# 计算每日股价振幅,振幅=最高价-最低价
df3.apply(lambda x:x.max()-x.min())
open 1.575493
high 1.390144
low 0.648727
close 0.648735
dtype: float64
# 通过apply对各个元素调用字符串方法,比如将股票代码的后缀改成小写
temp = df2.ix['2016-11-15':'2016-11-21']
temp['instrument'].apply(lambda x : x.lower())
date
2016-11-15 000005.sza
2016-11-16 000005.sza
2016-11-17 000005.sza
2016-11-18 000005.sza
2016-11-21 000005.sza
Name: instrument,
dtype: object
rolling_apply的使用
# 获取数据
sql=""WITH data AS(
SELECT date, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000001.SZA'
AND date BETWEEN '2016-01-01' AND '2016-10-01'
ORDER BY date;)""
# 计算5日均线和20日均线
data['MA_' + str(5)] = data['close'].rolling(5).mean()
data['MA_' + str(20)] = data['close'].rolling(20).mean()
data.head(12)
instrument | close | MA_5 | MA_20 | |
---|---|---|---|---|
date | ||||
2016-01-04 | 000001.SZA | 974.312988 | NaN | NaN |
2016-01-05 | 000001.SZA | 980.332520 | NaN | NaN |
2016-01-06 | 000001.SZA | 991.511780 | NaN | NaN |
2016-01-07 | 000001.SZA | 940.775269 | NaN | NaN |
2016-01-08 | 000001.SZA | 956.254211 | 968.637354 | NaN |
2016-01-11 | 000001.SZA | 925.296326 | 958.834021 | NaN |
2016-01-12 | 000001.SZA | 929.596008 | 948.686719 | NaN |
2016-01-13 | 000001.SZA | 920.996643 | 934.583691 | NaN |
2016-01-14 | 000001.SZA | 926.156250 | 931.659888 | NaN |
2016-01-15 | 000001.SZA | 899.498108 | 920.308667 | NaN |
2016-01-18 | 000001.SZA | 895.198425 | 914.289087 | NaN |
2016-01-19 | 000001.SZA | 920.996643 | 912.569214 | NaN |
#合并
Concat 连接
# 通过concat()来连接pandas对象
a = df2.loc['2016-11-15':'2016-11-16']
b = df2.loc['2016-11-20':'2016-11-26']
pd.concat([a,b])
date | instrument | open | high | low | close |
---|---|---|---|---|---|
2016-11-15 | 000005.SZA | 69.785027 | 70.619110 | 69.507004 | 70.526436 |
2016-11-16 | 000005.SZA | 71.360519 | 71.545876 | 69.785027 | 70.155731 |
2016-11-21 | 000005.SZA | 69.970383 | 70.711784 | 69.692352 | 69.877701 |
2016-11-22 | NaN | NaN | NaN | NaN | NaN |
2016-11-23 | NaN | NaN | NaN | NaN | NaN |
2016-11-24 | NaN | NaN | NaN | NaN | NaN |
2016-11-25 | NaN | NaN | NaN | NaN | NaN |
Join 合并
# 合并,和merge功能类似
sql=""WITH df5 AS (
SELECT
date,
open,
high,
low,
close
FROM
bar1d_CN_STOCK_A
WHERE
instrument = '000005.SZA'
AND date BETWEEN '2017-02-01' AND '2017-02-10'
) , df6 AS (
SELECT
date,
market_cap,
pe_ttm
FROM
market_value_CN_STOCK_A
WHERE
instrument = '000005.SZA'
AND date BETWEEN '2017-01-01' AND '2017-02-10'
)""
# 删除股票代码列
del df5['instrument']
del df6['instrument']
# 按日期轴合并数据
df5.join(df6)
date | open | high | low | close | market_cap | pe_ttm |
---|---|---|---|---|---|---|
2017-02-03 | 60.888130 | 60.888130 | 60.239399 | 60.517429 | 6.912246e+09 | 63.786068 |
2017-02-06 | 60.517429 | 60.795456 | 60.146725 | 60.795456 | 6.944002e+09 | 64.079109 |
2017-02-07 | 60.795456 | 61.629539 | 60.424751 | 61.166161 | 6.986343e+09 | 64.469841 |
2017-02-08 | 60.980808 | 61.166161 | 60.424751 | 61.073483 | 6.975758e+09 | 64.372154 |
2017-02-09 | 61.073483 | 61.536865 | 60.888130 | 61.444187 | 7.018099e+09 | 64.762886 |
2017-02-10 | 61.629539 | 61.629539 | 61.258835 | 61.536865 | 7.028685e+09 | 64.860565 |
Append 添加
# 通过append方法,将若干行添加到dataFrame后面
c = df1.loc['2017-02-03':'2017-02-06']
d = df1.loc['2017-02-08':'2017-02-10']
c.append(d)
date | instrument | open | high | low | close | market_cap |
---|---|---|---|---|---|---|
2017-02-03 | 111 | 111.000000 | 111.000000 | 111.000000 | 111.000000 | 111.0 |
2017-02-06 | 000005.SZA | 60.517429 | 60.795456 | 60.146725 | 60.795456 | NaN |
2017-02-08 | 000005.SZA | 60.980808 | 61.166161 | 60.424751 | 61.073483 | NaN |
2017-02-09 | 000005.SZA | 61.073483 | 61.536865 | 60.888130 | 61.444187 | NaN |
2017-02-10 | 000005.SZA | 61.629539 | 61.629539 | 61.258835 | 61.536865 | NaN |
数据透视表
也许大多数人都有在Excel中使用数据透视表的经历,其实Pandas也提供了一个类似的功能,名为pivot_table
df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
'B' : ['A', 'B', 'C'] * 4,
'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
'D' : np.random.randn(12),
'E' : np.random.randn(12)})
df
A | B | C | D | E | |
---|---|---|---|---|---|
0 | one | A | foo | -2.216199 | 0.288395 |
1 | one | B | foo | 1.572496 | 0.159725 |
2 | two | C | foo | -0.559229 | 0.204685 |
3 | three | A | bar | 2.044115 | -0.032094 |
4 | one | B | bar | 1.022778 | 1.650399 |
5 | one | C | bar | -0.558943 | 1.074547 |
6 | two | A | foo | -0.147329 | 0.150598 |
7 | three | B | foo | 0.659760 | 1.779248 |
8 | one | C | foo | 0.506593 | -0.343586 |
9 | one | A | bar | 0.941787 | -0.277070 |
10 | two | B | bar | 1.662566 | -0.558867 |
11 | three | C | bar | -1.671826 | -0.939832 |
# 我们可以轻松得到数据透视表
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
| bar | foo | |
---|---|---|---|
A | B | ||
one | A | 0.941787 | -2.216199 |
B | 1.022778 | 1.572496 | |
C | -0.558943 | 0.506593 | |
three | A | 2.044115 | NaN |
B | NaN | 0.659760 | |
C | -1.671826 | NaN | |
two | A | NaN | -0.147329 |
B | 1.662566 | NaN | |
C | NaN | -0.559229 |
# 查看每个交易日多只股票的收盘价数据(每列为股票)
df = D.history_data(['000001.SZA','000002.SZA'],'2016-12-15','2017-01-01',fields=['close'],groupped_by_instrument=False)
pd.pivot_table(df,values='close',index=['date'],columns=['instrument'])
instrument | 000001.SZA | 000002.SZA |
---|---|---|
date | ||
2016-12-15 | 969.013855 | 2965.032471 |
2016-12-16 | 969.013855 | 2982.294189 |
2016-12-19 | 963.775940 | 2801.710205 |
2016-12-20 | 954.347656 | 2699.467529 |
2016-12-21 | 959.585571 | 2719.385010 |
2016-12-22 | 957.490417 | 2736.646729 |
2016-12-23 | 951.204956 | 2695.484131 |
2016-12-26 | 955.395264 | 2741.958008 |
2016-12-27 | 951.204956 | 2844.200439 |
2016-12-28 | 949.109802 | 2814.988281 |
2016-12-29 | 951.204956 | 2767.186768 |
2016-12-30 | 953.300110 | 2728.679688 |
# 查看不同股票每个交易日的收盘价数据(每列为交易日)
pd.pivot_table(df,values = 'close',index=['instrument'],columns=['date'])
date | 2016-12-15 00 :00 :00 | 2016-12-16 00 :00 :00 | 2016-12-19 00 :00 :00 | 2016-12-20 00 :00 :00 | 2016-12-21 00 :00 :00 | 2016-12-22 00 :00 :00 | 2016-12-23 00 :00 :00 | 2016-12-26 00 :00 :00 | 2016-12-27 00 :00 :00 | 2016-12-28 00 :00 :00 | 2016-12-29 00 :00 :00 | 2016-12-30 00 :00 :00 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
instrument | ||||||||||||
000001.SZA | 969.013855 | 969.013855 | 963.775940 | 954.347656 | 959.585571 | 957.490417 | 951.204956 | 955.395264 | 951.204956 | 949.109802 | 951.204956 | 953.300110 |
000002.SZA | 2965.032471 | 2982.294189 | 2801.710205 | 2699.467529 | 2719.385010 | 2736.646729 | 2695.484131 | 2741.958008 | 2844.200439 | 2814.988281 | 2767.186768 | 2728.679688 |
#分组
对于“group by”操作,我们通常是指以下一个或几个步骤: * 划分 按照某些标准将数据分为不同的组 * 应用 对每组数据分别执行一个函数 * 组合 将结果组合到一个数据结构
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'bar'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | -0.160349 | -0.086323 |
1 | bar | one | -0.994189 | -1.444457 |
2 | foo | two | -0.095365 | 0.227690 |
3 | bar | three | 1.502585 | -0.286937 |
4 | foo | two | -0.258329 | -0.123057 |
5 | bar | two | 0.944706 | -0.211243 |
6 | foo | one | 1.542726 | -0.164639 |
7 | bar | three | 0.874764 | -0.075149 |
# 分组并对每个分组应用sum函数
df.groupby('A').sum()
A | C | D |
---|---|---|
bar | 2.327866 | -2.017786 |
foo | 1.028683 | -0.146328 |
# 按多个列分组形成层级索引,然后应用函数
df.groupby(['A','B']).sum()
A | B | C | D |
---|---|---|---|
bar | one | -0.994189 | -1.444457 |
three | 2.377349 | -0.362086 | |
two | 0.944706 | -0.211243 | |
foo | one | 1.382377 | -0.250962 |
two | -0.353694 | 0.104634 |
# 通过groupby函数来计算各股票和大盘指数的相关系数(按年划分)
# 获取数据
stocks = ['000005.SZA','000010.SZA','603998.SHA','603898.SHA']
sql=""WITH df AS(
SELECT instrument, date, close
FROM bar1d_CN_STOCK_A
WHERE instrument IN ('000005.SZA','000010.SZA','603998.SHA','603898.SHA')
AND date BETWEEN '2010-01-01' AND '2016-12-31'
ORDER BY instrument, date;)""
# 每列转化为每日多只股票的收盘价数据
df = pd.pivot_table(df,values='close',index=['date'],columns=['instrument'])
# 计算日收益率
rets = df.pct_change().dropna()
# 定义求相关系数的函数
corr_func = lambda x:x.corrwith(x['000005.SZA'])
# 求每只股票和大盘相关性
by_year = rets.groupby(lambda x:x.year)
by_year.apply(corr_func)
instrument | 000005.SZA | 000010.SZA | 603898.SHA | 603998.SHA |
---|---|---|---|---|
2015 | 1.0 | 0.628609 | 0.178597 | 0.523903 |
2016 | 1.0 | 0.643711 | 0.627512 | 0.548929 |
#时间序列
pandas在对频率转换进行重新采样时拥有着简单,强大而且高效的功能(例如把按秒采样的数据转换为按5分钟采样的数据)。这在金融领域很常见,但又不限于此。
# 按秒产生5000个时间、起始时间是2017-01-01的时间序列
rng = pd.date_range('1/1/2017', periods=5000, freq='S')
rng
DatetimeIndex(['2017-01-01 00 :00 :00 ', '2017-01-01 00 :00 :01 ',
'2017-01-01 00 :00 :02 ', '2017-01-01 00 :00 :03 ',
'2017-01-01 00 :00 :04 ', '2017-01-01 00 :00 :05 ',
'2017-01-01 00 :00 :06 ', '2017-01-01 00 :00 :07 ',
'2017-01-01 00 :00 :08 ', '2017-01-01 00 :00 :09 ',
...
'2017-01-01 01 :23 :10 ', '2017-01-01 01 :23 :11 ',
'2017-01-01 01 :23 :12 ', '2017-01-01 01 :23 :13 ',
'2017-01-01 01 :23 :14 ', '2017-01-01 01 :23 :15 ',
'2017-01-01 01 :23 :16 ', '2017-01-01 01 :23 :17 ',
'2017-01-01 01 :23 :18 ', '2017-01-01 01 :23 :19 '],
dtype='datetime64[ns]', length=5000, freq='S')
# 重采样,计算每五分钟的总和
ts = pd.Series(np.random.randint(0,5000,len(rng)), index=rng)
ts.resample('5Min', how='sum')
2017-01-01 00:00:00 765477
2017-01-01 00:05:00 755782
2017-01-01 00:10:00 759474
2017-01-01 00:15:00 738611
2017-01-01 00:20:00 782165
2017-01-01 00:25:00 741331
2017-01-01 00:30:00 771220
2017-01-01 00:35:00 790782
2017-01-01 00:40:00 739392
2017-01-01 00:45:00 800815
2017-01-01 00:50:00 725954
2017-01-01 00:55:00 725840
2017-01-01 01:00:00 739548
2017-01-01 01:05:00 756409
2017-01-01 01:10:00 727530
2017-01-01 01:15:00 776613
2017-01-01 01:20:00 522169
Freq: 5T, dtype: int64
# 频率转化,将日K线数据装换成周K
sql=""WITH df AS(
SELECT date, close
FROM bar1d_CN_STOCK_A
WHERE instrument = '000001.SZA'
AND date BETWEEN '2016-11-01' AND '2016-12-31'
ORDER BY date;)""
df.resample('W').ohlc()
close | ||||
---|---|---|---|---|
date | open | high | low | close |
2016-11-06 | 957.490417 | 957.490417 | 950.157349 | 954.347656 |
2016-11-13 | 955.395264 | 961.680786 | 950.157349 | 961.680786 |
2016-11-20 | 965.871094 | 966.918701 | 961.680786 | 961.680786 |
2016-11-27 | 967.966248 | 1007.774414 | 967.966248 | 1007.774414 |
2016-12-04 | 1008.821960 | 1008.821960 | 1000.441345 | 1000.441345 |
2016-12-11 | 991.013062 | 1010.917114 | 991.013062 | 1010.917114 |
2016-12-18 | 995.203430 | 995.203430 | 969.013855 | 969.013855 |
2016-12-25 | 963.775940 | 963.775940 | 951.204956 | 951.204956 |
2017-01-01 | 955.395264 | 955.395264 | 949.109802 | 953.300110 |
#绘图
数据可视化:可交互图表 (文档: 数据可视化)
# 绘制沪深300走势图
history_hs300 = DataSource('bar1d_CN_STOCK_A').read(instruments = ['000300.HIX'], start_date='2016-01-01', end_date='2017-01-01', fields=['close'])
T.plot(history_hs300[['date', 'close']].set_index('date'), chart_type='spline')
# 绘制多只股票走势图
sql="" WITH data AS(
SELECT instrument, date, close