01-pandas高频操作

pandas入门级高频操作

#读取数据
pd.read_csv('file.csv') #文件路径是字符串,需要加引号
#取列
df[['col1','col2']] #多列必须用双中括号;单列可写 df['col1']
#条件筛选
df[df['col1']=='xx']
#缺失值删除:删除name为空的行
df.dropna(subset = ['name'])
#去重
df.drop_duplicates(subset = ['col1'])
df.duplicated() #布尔值,行是否重复
df['col'].duplicated() #布尔值,该列中的取值是否重复出现
#重命名
df.rename(columns = {'col1':'recol1','col2':'recol2'})
#修改列 salary加倍
df['salary'] *= 2
#排序
df_sorted = df.sort_values(by=['col1', 'col2', 'col3'], ascending=[True, False, True])
#分组统计
df.groupby('列名').agg(统计函数)
#修改字段类型 col1改成int类型
df['col1'] = df['col1'].astype('int')
#垂直连结(上下合并)垂直 concat,axis=0,ignore_index=True
result = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
#行转列 透视
df.pivot(index = x,columns = x,values = x)
eg:df_wide = df_long.pivot(
    index='姓名',      # 行索引
    columns='科目',    # 要展开成列的字段
    values='分数'      # 填入单元格的值
).reset_index()
#列转行 融合
pd.melt(df,id_vars = x,var_name = x,value_name = x)
eg:df_long = pd.melt(df, id_vars=['姓名'], var_name='科目', value_name='分数')
#整体转置 行直接变成列
df_transposed = df.T
#返回前n行
df.head(n)
#标签索引和位置索引
df.loc[df['A'] > 20, 'B']  # A>20 的行,取 B 列
df.iloc[0:3]   # 取前3行 注意:不包含位置 3!

pandas与MYSQL操作类比-基础

1. 查询全部数据

SELECT * FROM table;

df

 

 

df.head()

Pandas DataFrame 默认显示全部(或用

 

.head()

 

查看前几行)

2. 选择特定列

SELECT col1, col2 FROM table;

df[['col1', 'col2']]

注意:单列用

 

df['col']

,多列必须用双括号

3. 条件筛选(WHERE)

SELECT * FROM table WHERE age > 30;

df[df['age'] > 30]

 

df.query('age > 30')

Pandas 支持布尔索引或

 

.query()

 

字符串表达式

4. 多条件筛选

SELECT * FROM table WHERE age > 30 AND city = '北京';

df[(df['age'] > 30) & (df['city'] == '北京')]

 

df.query("age > 30 and city == '北京'")

注意:Pandas 中用 `&`(与)、`|`(或)、`~`(非),且每个条件必须分别加括号

5. 排序(ORDER BY)

SELECT * FROM table ORDER BY salary DESC;

df.sort_values('salary', ascending=False)

多列排序:

sort_values(['col1', 'col2'], ascending=[True, False])

6. 分组统计(GROUP BY)

SELECT dept, AVG(salary) FROM table GROUP BY dept;

df.groupby('dept')['salary'].mean()

 

df.groupby('dept').agg({'salary': 'mean'})

Pandas 的

 

groupby

 

+ 聚合函数

7. 计数(COUNT)

SELECT COUNT(*) FROM table;

len(df)

 

 

df.shape[0]

行数统计

SELECT dept, COUNT(*) FROM table GROUP BY dept;

df.groupby('dept').size()

分组计数(含 NaN)

 

df.groupby('dept')['id'].count()

(不含 NaN)

8. 去重(DISTINCT)

SELECT DISTINCT city FROM table;

df['city'].unique()

 

df[['city']].drop_duplicates()

单列去重用

 

.unique()

,多列用

 

.drop_duplicates()

9. 限制结果数量(LIMIT)

SELECT * FROM table LIMIT 5;

df.head(5)

取前 N 行

10. 空值处理(IS NULL)

SELECT * FROM table WHERE name IS NULL;

df[df['name'].isnull()]

Pandas 用

 

.isnull()

 

/

 

.isna()

SELECT * FROM table WHERE name IS NOT NULL;

df[df['name'].notnull()]

11. 字符串模糊匹配(LIKE)

SELECT * FROM table WHERE name LIKE '%明%';

df[df['name'].str.contains('明')]

注意:需确保列是字符串类型(

str

 

accessor)

12. 列重命名(AS)

SELECT name AS 姓名 FROM table;

df.rename(columns={'name': '姓名'})

13. 新增计算列

SELECT *, salary * 12 AS annual FROM table;

df['annual'] = df['salary'] * 12

直接赋值创建新列

14. 连接表(JOIN)

SELECT * FROM A JOIN B ON A.id = B.id;

pd.merge(dfA, dfB, on='id')

支持

 

how='inner'/'left'/'right'/'outer'

15. 垂直拼接(UNION ALL)

SELECT * FROM A UNION ALL SELECT * FROM B;

pd.concat([dfA, dfB], ignore_index=True)

UNION

(去重)需额外加

 

.drop_duplicates()

16. 最大/最小值

SELECT MAX(salary) FROM table;

df['salary'].max()

17. 分页(LIMIT OFFSET)

SELECT * FROM table LIMIT 10 OFFSET 20;

df.iloc[20:30]

 

.iloc

 

按位置切片

pandas与MYSQL操作类比-实战

#排序并赋予序号
mysql:row_number() over (partition by xx order by xx) as rn
pandas:df['col'] = range(1,len(df)+1) #从1开始(无 partition by 时)
pandas:df['col'] = range(len(df)) #从0开始(无 partition by 时)
pandas:df['rn'] = df.sort_values('排序列').groupby('分区列').cumcount() + 1 #带 partition by 时用 groupby + cumcount
#条件赋值
mysql:if condition then B1 else B2 end as col
pandas:df['col'] = np.where(condition,B1,B2)
#正则/like
mysql:col REGEXP'x[a-zA-Z0-9]*'/col like 'x%'
pandas:df['col'].str.startswith('x') #开头
       df['col'].str.endswith('x') #结尾
       df['col'].str.contains('x') #中间包含
       df['col'].str.match('x') #正则从字符串开头匹配
#与、或、非
mysql:and or not
pandas:& | ~ #优先级高于四则运算,要添加好括号
#聚合函数(groupby)(max/min、count()、sum()、avg())
mysql: SELECT col, MAX(val) FROM table GROUP BY col;
pandas: df.groupby('col')['val'].max()

mysql: SELECT col, MIN(val) FROM table GROUP BY col;
pandas: df.groupby('col')['val'].min()

mysql: SELECT col, COUNT(*) FROM table GROUP BY col;
pandas: df.groupby('col').size()

mysql: SELECT col, COUNT(val) FROM table GROUP BY col; (忽略 NULL)
pandas: df.groupby('col')['val'].count()

mysql: SELECT col, SUM(val) FROM table GROUP BY col;
pandas: df.groupby('col')['val'].sum()

mysql: SELECT col, AVG(val) FROM table GROUP BY col;
pandas: df.groupby('col')['val'].mean()
#窗口函数(max/min/count/sum/avg)over (partition by order by)
SELECT 
    SUM(val) OVER (PARTITION BY col1 ORDER BY col2) AS running_sum,
    AVG(val) OVER (PARTITION BY col1 ORDER BY col2) AS running_avg,
    COUNT(*) OVER (PARTITION BY col1 ORDER BY col2) AS running_count,
    MIN(val) OVER (PARTITION BY col1 ORDER BY col2) AS running_min,
    MAX(val) OVER (PARTITION BY col1 ORDER BY col2) AS running_max
FROM table;
pandas:
# 无需排序,直接分组 transform
g = df.groupby('col1')['val']
df['sum_over']   = g.transform('sum')      # 分区内总和
df['min_over']   = g.transform('min')      # 分区内最小值
df['max_over']   = g.transform('max')      # 分区内最大值
df['count_over'] = g.transform('size')     # 分区内行数(含 NaN)
df['avg_over']   = g.transform('mean')     # 分区内平均值
#需要排序 !!!注意不能直接groupby().sort_values()
df = df.sort_values(['col1', 'col2'])
g = df.groupby('col1')['val']
df['running_sum']   = g.cumsum()               # 累积和
df['running_min']   = g.cummin()               # 累积最小(从开头到当前行的最小值)
df['running_max']   = g.cummax()               # 累积最大
df['running_count'] = g.cumcount() + 1         # 累积计数(从1开始)
df['running_avg']   = g.expanding().mean().reset_index(level=0, drop=True)  # 累积平均
#去重(distinct)
mysql:select distinct col1,col2 from table
pandas:
df['col1'].unique() #单列去重
df[['col1','col2']].drop_duplicates() #多列去重
#表连接(left join/right join/inner join)
mysql:table a left/right/inner join b on a.col = b.col
pandas:pd.merge(a, b, on='col', how='left'/'right'/'inner')
pd.merge(a, b, left_on='id', right_on='customer_id', how='left')#左右表列名不同时
#限制空值(col is null)
mysql:where col is null
pandas:df[df['col'].isnull()]/df[df['col'].isna()]

大数据开发 文章被收录于专栏

大数据开发专栏 新手小白友好

全部评论

相关推荐

评论
点赞
收藏
分享

创作者周榜

更多
牛客网
牛客网在线编程
牛客网题解
牛客企业服务