Python中的DataFrame模块学习

发布时间:2019-09-03 09:13:50 | 编辑:auto | 阅读(1513)

      本文是基于Windows系统环境,学习和测试DataFrame模块:

      Windows 10

      PyCharm 2018.3.5 for Windows (exe)

      python 3.6.8 Windows x86 executable installer

      1. 初始化DataFrame

      创建一个空的DataFrame变量

      import pandas as pd

      import numpy as np

      # Build a DataFrame that has no rows and no columns.
      data = pd.DataFrame()
      # An empty frame reports a shape of (0, 0).
      print(data.shape)  # (0, 0)

      通过字典创建一个DataFrame

      import pandas as pd

      import numpy as np

      # Each dict key becomes a column; each list supplies that column's rows.
      dict_a = dict(name=['xu', 'wang'], gender=['male', 'female'])
      data = pd.DataFrame(data=dict_a)
      print(data.shape)  # (2, 2)
      print(data)
      # Printed frame:
      #    name  gender
      # 0    xu    male
      # 1  wang  female

      通过numpy.array创建一个DataFrame

      import pandas as pd

      import numpy as np

      # 3x4 matrix of samples drawn from the standard normal distribution.
      mat = np.random.randn(3, 4)
      # Name the four columns when the frame is constructed.
      df = pd.DataFrame(mat, columns=['a', 'b', 'c', 'd'])
      print(df)

      一个DataFrame转成numpy.array

      import pandas as pd

      import numpy as np

      mat = np.random.randn(3, 4)
      df = pd.DataFrame(mat, columns=list('abcd'))
      print(df)
      # Copy the frame's values into a standalone numpy array.
      n = np.array(df.values)
      print(n)

      DataFrame增加一列数据

      import pandas as pd

      import numpy as np

      data = pd.DataFrame()
      # Assigning to a new key adds a column named 'ID' holding 0..9.
      data['ID'] = range(10)
      print(data.shape)  # (10, 1)

      DataFrame增加一列数据,且值相同

      import pandas as pd

      import numpy as np

      dict_a = dict(name=['xu', 'wang'], gender=['male', 'female'])
      # A scalar value is repeated for every row of the new column.
      data = pd.DataFrame(dict_a).assign(country='China')
      print(data)
      # Printed frame:
      #    name  gender country
      # 0    xu    male   China
      # 1  wang  female   China

      DataFrame删除重复的数据行

      import pandas as pd

      # Sample frame: rows 0 and 2 share the same (A_ID, B_ID) pair.
      df = pd.DataFrame({
          'A_ID': [1, 2, 1],
          'B_ID': [10, 20, 10],
          'value': ['x', 'y', 'z'],
      })
      # Two rows count as duplicates when they match on every column in subset.
      # subset may also hold integer column labels (e.g. subset=[1, 2]) when
      # the frame's columns are labelled with integers.
      norepeat_df = df.drop_duplicates(subset=['A_ID', 'B_ID'], keep='first')
      # norepeat_df keeps rows 0 and 1; row 2 repeats row 0's key and is dropped.
      #
      # keep='first' keeps the first row of each group of duplicates
      # keep='last'  keeps the last row of each group of duplicates
      # keep=False   drops every row that has a duplicate

      2. 基本操作

      去除某一列两端的指定字符

      import pandas as pd

      dict_a = dict(name=['.xu', 'wang'], gender=['male', 'female.'])
      data = pd.DataFrame(dict_a)
      print(data)
      # Before:
      #    name   gender
      # 0   .xu     male
      # 1  wang  female.

      # Strip '.' from both ends of every value in 'name'; 'gender' is untouched.
      # Calling .str.strip() with no argument strips surrounding whitespace instead.
      stripped_names = data['name'].str.strip('.')
      data['name'] = stripped_names
      print(data)
      # After:
      #    name   gender
      # 0    xu     male
      # 1  wang  female.

      重新调整index的值

      import pandas as pd

      data = pd.DataFrame()
      data['ID'] = range(3)
      # With the default index:
      #    ID
      # 0   0
      # 1   1
      # 2   2

      # Relabel the rows so the index starts at 1 instead of 0.
      row_count = len(data)
      data.index = range(1, row_count + 1)
      # After relabelling:
      #    ID
      # 1   0
      # 2   1
      # 3   2

      调整DataFrame列顺序

      import pandas as pd

      # Sample frame whose columns start in the order ID, name.
      data = pd.DataFrame({'ID': [0, 1, 2], 'name': ['xu', 'wang', 'li']})
      print(data)
      # Before:
      #    ID  name
      # 0   0    xu
      # 1   1  wang
      # 2   2    li

      # Selecting with a list of column names returns them in the listed order.
      data = data[['name', 'ID']]
      # After:
      #    name  ID
      # 0    xu   0
      # 1  wang   1
      # 2    li   2

      获取DataFrame的列名

      import pandas as pd

      # Sample frame with two named columns.
      data = pd.DataFrame({'ID': [0, 1, 2], 'name': ['xu', 'wang', 'li']})
      print(data)
      #    ID  name
      # 0   0    xu
      # 1   1  wang
      # 2   2    li

      # Column labels as a plain Python list.
      print(data.columns.tolist())
      # ['ID', 'name']

      获取DataFrame的行名

      import pandas as pd

      # Sample frame with the default 0..2 row index.
      data = pd.DataFrame({'ID': [0, 1, 2], 'name': ['xu', 'wang', 'li']})
      print(data)
      #    ID  name
      # 0   0    xu
      # 1   1  wang
      # 2   2    li

      # Row labels as a plain Python list, read through the public index attribute.
      print(data.index.tolist())
      # [0, 1, 2]

      3. 读写操作

      将csv文件读入DataFrame数据

      read_csv()函数的参数配置参考官网pandas.read_csv

      import pandas as pd

      # Load user.csv into a DataFrame and show it.
      data = pd.read_csv(filepath_or_buffer='user.csv')
      print(data)

      将DataFrame数据写入csv文件

      to_csv()函数的参数配置参考官网pandas.DataFrame.to_csv

      import pandas as pd

      # Read test1.csv, then write the same data out as test2.csv.
      data = pd.read_csv('test1.csv')
      # header=True writes the column names; index=False leaves the row index out.
      data.to_csv('test2.csv', header=True, index=False)

      4. 异常处理

      过滤所有包含NaN的行

      dropna()函数的参数配置参考官网pandas.DataFrame.dropna

      from numpy import nan as NaN

      import pandas as pd

      rows = [
          [1, 2, 3],
          [NaN, NaN, 2],
          [NaN, NaN, NaN],
          [8, 8, NaN],
      ]
      data = pd.DataFrame(rows)
      print(data)
      # Before:
      #      0    1    2
      # 0  1.0  2.0  3.0
      # 1  NaN  NaN  2.0
      # 2  NaN  NaN  NaN
      # 3  8.0  8.0  NaN

      # Drop every row that contains at least one NaN (these are the defaults,
      # spelled out explicitly).
      data = data.dropna(axis=0, how='any')
      # DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
      # axis:    0 or 'index' drops rows; 1 or 'columns' drops columns
      # how:     'any' drops a row/column containing any NaN;
      #          'all' drops it only when every value is NaN
      # thresh:  integer n; keep a row/column only if it has at least n non-NaN values
      # subset:  labels to check for NaN, e.g. ['name', 'gender']; index labels
      #          may be given as well, but then axis=1 is required
      # inplace: if True, modify the frame in place and return None
      print(data)
      # After:
      #      0    1    2
      # 0  1.0  2.0  3.0


关键字