[29]:
import pandas as pd
import os
# Location of the raw iris data set and the student id used to name the output file.
file_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3\iris.txt"
student_id = "2023014925"
# iris.txt has no header row, so header=None makes pandas label the columns 0..4.
iris_data = pd.read_csv(file_path, header=None, sep=",")
print("姓名:丁晴")
print(f"学号:{student_id}")
print(iris_data)
# Attach descriptive column names. "Petal.Width" is capitalised like the other
# three measurement columns (the previous spelling was "Petal.width").
headers = ["Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width", "Species"]
iris_data.columns = headers
# Show the frame again to confirm the data was read and the headers were applied.
print("姓名:丁晴")
print(f"学号:{student_id}")
print("读取的数据及添加表头后的结果:")
print(iris_data)
# Save as <student_id>.csv in the same folder as the input file;
# index=False keeps the row index out of the file.
output_file = os.path.join(os.path.dirname(file_path), f"{student_id}.csv")
iris_data.to_csv(output_file, index=False)
print("姓名:丁晴")
print(f"学号:{student_id}")
print(f"\n数据已保存为: {output_file}")
姓名:丁晴 学号:2023014925 0 1 2 3 4 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa .. ... ... ... ... ... 145 6.7 3.0 5.2 2.3 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica [150 rows x 5 columns] 姓名:丁晴 学号:2023014925 读取的数据及添加表头后的结果: Sepal.Length Sepal.Width Petal.Length Petal.width Species 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa .. ... ... ... ... ... 145 6.7 3.0 5.2 2.3 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica [150 rows x 5 columns] 姓名:丁晴 学号:2023014925 数据已保存为: C:\Users\ASUS\Desktop\大二下课程\python数据处理\3\2023014925.csv
[9]:
# Overwrite the column labels in place (note the capital W in 'Petal.Width'),
# then end the cell on the frame itself so the notebook renders it as a table.
corrected_columns = ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
iris_data.columns = corrected_columns
iris_data
[9]:
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | Iris-virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | Iris-virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | Iris-virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | Iris-virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | Iris-virginica |
150 rows × 5 columns
[19]:
# Write the frame to a CSV named after the student id (relative path, so it
# lands in the current working directory). index=False keeps the row index out
# of the file; without it, reading the file back yields an extra unnamed column.
iris_data.to_csv('2023014925.csv', index=False)
[1]:
# Import pandas inside this cell so it also runs on a freshly started kernel;
# without it the cell fails with "NameError: name 'pd' is not defined".
import pandas as pd

# In this cell file1/df1 is iris_NaN.csv and file2/df2 is the student-id CSV.
file1_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3\iris_NaN.csv"
file2_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3\2023014925.csv"
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)
# Print the column labels of both frames to compare their schemas.
print("DataFrame 1 columns:", df1.columns)
print("DataFrame 2 columns:", df2.columns)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 3 1 file1_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3\iris_NaN.csv" 2 file2_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3\2023014925.csv" ----> 3 df1 = pd.read_csv(file1_path) 4 df2 = pd.read_csv(file2_path) 5 print("DataFrame 1 columns:", df1.columns) NameError: name 'pd' is not defined
[37]:
import pandas as pd
import os
# Both inputs live in the same folder; the first one is named after the student id.
student_id = "2023014925"
base_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3"
file1_path = os.path.join(base_path, f"{student_id}.csv")
file2_path = os.path.join(base_path, "iris_NaN.csv")

# df1 is the self-created CSV, df2 is iris_NaN.csv.
df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# Preview the first five rows of each input.
print("DataFrame 1 (前5行):", df1.head(), sep="\n")
print("\nDataFrame 2 (前5行):", df2.head(), sep="\n")

# Inner-join the two frames on the Species column. Species values repeat in
# both frames, so rows are paired many-to-many (the recorded run shows 7500 rows).
merged_df = df1.merge(df2, how='inner', on='Species')

print("姓名:丁晴", "学号:2023014925", sep="\n")
# Print the merged frame, followed by its (rows, columns) shape.
print("合并后的DataFrame :", merged_df, sep="\n")
print(f"\n合并后DataFrame的形状: {merged_df.shape}")
DataFrame 1 (前5行): Sepal.Length Sepal.Width Petal.Length Petal.width Species 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa DataFrame 2 (前5行): Feature_1 Feature_2 Feature_3 Feature_4 Species 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa 姓名:丁晴 学号:2023014925 合并后的DataFrame : Sepal.Length Sepal.Width Petal.Length Petal.width Species \ 0 5.1 3.5 1.4 0.2 Iris-setosa 1 5.1 3.5 1.4 0.2 Iris-setosa 2 5.1 3.5 1.4 0.2 Iris-setosa 3 5.1 3.5 1.4 0.2 Iris-setosa 4 5.1 3.5 1.4 0.2 Iris-setosa ... ... ... ... ... ... 7495 5.9 3.0 5.1 1.8 Iris-virginica 7496 5.9 3.0 5.1 1.8 Iris-virginica 7497 5.9 3.0 5.1 1.8 Iris-virginica 7498 5.9 3.0 5.1 1.8 Iris-virginica 7499 5.9 3.0 5.1 1.8 Iris-virginica Feature_1 Feature_2 Feature_3 Feature_4 0 5.1 3.5 1.4 0.2 1 4.9 3.0 1.4 0.2 2 4.7 3.2 1.3 0.2 3 4.6 3.1 1.5 0.2 4 5.0 3.6 1.4 0.2 ... ... ... ... ... 7495 6.7 3.0 5.2 2.3 7496 6.3 2.5 5.0 1.9 7497 6.5 3.0 5.2 2.0 7498 6.2 3.4 5.4 2.3 7499 5.9 3.0 5.1 1.8 [7500 rows x 9 columns] 合并后DataFrame的形状: (7500, 9)
[3]:
Selection deleted
import pandas as pd
import os
# Folder that holds both input files, and the student id that names the first one.
base_path = r"C:\Users\ASUS\Desktop\大二下课程\python数据处理\3"
student_id = "2023014925"
# First input: the self-created CSV named after the student id.
file1_path = os.path.join(base_path, f"{student_id}.csv")
df1 = pd.read_csv(file1_path)
# Second input: iris_NaN.csv.
file2_path = os.path.join(base_path, "iris_NaN.csv")
df2 = pd.read_csv(file2_path)
# Report the shape of each input frame.
print(f"DataFrame 1的形状: {df1.shape}")
print(f"DataFrame 2的形状: {df2.shape}")
# Stack the two frames along the row direction (axis=0). This previously passed
# axis=1, which placed the frames side by side and duplicated the Species column.
# Columns are aligned by name, so a column that exists in only one frame is NaN
# in the rows that come from the other frame.
concatenated_df = pd.concat([df1, df2], axis=0)
print("姓名:丁晴")
print("学号:2023014925")
# Print the concatenated frame (pandas truncates long frames to head and tail).
print("连接后DataFrame:")
print(concatenated_df)
# Report the shape (rows, columns) of the concatenated frame.
print(f"\n连接后DataFrame的形状: {concatenated_df.shape}")
DataFrame 1的形状: (150, 5) DataFrame 2的形状: (150, 5) 姓名:丁晴 学号:2023014925 连接后DataFrame: Sepal.Length Sepal.Width Petal.Length Petal.width Species \ 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa .. ... ... ... ... ... 145 6.7 3.0 5.2 2.3 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica Feature_1 Feature_2 Feature_3 Feature_4 Species 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa .. ... ... ... ... ... 145 6.7 3.0 5.2 2.3 Iris-virginica 146 6.3 2.5 5.0 1.9 Iris-virginica 147 6.5 3.0 5.2 2.0 Iris-virginica 148 6.2 3.4 5.4 2.3 Iris-virginica 149 5.9 3.0 5.1 1.8 Iris-virginica [150 rows x 10 columns] 连接后DataFrame的形状: (150, 10)
[53]:
# a. Count the missing values per column of the concatenated frame
#    (isna() is the pandas alias of isnull()).
print("检查缺失值:")
missing_values = concatenated_df.isna().sum()
print(missing_values)
检查缺失值: Sepal.Length 150 Sepal.Width 150 Petal.Length 150 Petal.width 150 Species 0 Feature_1 150 Feature_2 191 Feature_3 191 Feature_4 150 dtype: int64
[51]:
print("姓名:丁晴", "学号:2023014925", sep="\n")
# b. Drop every row that contains at least one missing value
#    (axis=0 / how="any" are the dropna defaults, spelled out here).
df_dropped = concatenated_df.dropna(axis=0, how="any")
# Compare the shapes before and after, and report how many rows were removed.
print("\n删除缺失值后的DataFrame形状:")
print(f"原始形状: {concatenated_df.shape}, 删除缺失值后: {df_dropped.shape}")
print(f"删除了 {concatenated_df.shape[0] - df_dropped.shape[0]} 行")
# Leave the frame as the last expression so the notebook renders it.
df_dropped
姓名:丁晴 学号:2023014925 删除缺失值后的DataFrame形状: 原始形状: (300, 9), 删除缺失值后: (0, 9) 删除了 300 行
[51]:
Sepal.Length | Sepal.Width | Petal.Length | Petal.width | Species | Feature_1 | Feature_2 | Feature_3 | Feature_4 |
---|
[55]:
# c. Fill missing values with the last digit of the student id.
# fill_value is derived here so the cell does not rely on a variable left over
# from an earlier kernel session; student_id is set by the cell that builds
# concatenated_df.
fill_value = int(student_id[-1])
df_filled = concatenated_df.fillna(fill_value)
print(f"\n使用学号最后一位 {fill_value} 填充缺失值后的DataFrame:")
print(df_filled.head(10))  # first 10 rows, where some filled values should be visible
# Confirm that no missing values remain after filling.
print("\n填充后检查缺失值:")
print(df_filled.isnull().sum())
使用学号最后一位 5 填充缺失值后的DataFrame: Sepal.Length Sepal.Width Petal.Length Petal.width Species \ 0 5.1 3.5 1.4 0.2 Iris-setosa 1 4.9 3.0 1.4 0.2 Iris-setosa 2 4.7 3.2 1.3 0.2 Iris-setosa 3 4.6 3.1 1.5 0.2 Iris-setosa 4 5.0 3.6 1.4 0.2 Iris-setosa 5 5.4 3.9 1.7 0.4 Iris-setosa 6 4.6 3.4 1.4 0.3 Iris-setosa 7 5.0 3.4 1.5 0.2 Iris-setosa 8 4.4 2.9 1.4 0.2 Iris-setosa 9 4.9 3.1 1.5 0.1 Iris-setosa Feature_1 Feature_2 Feature_3 Feature_4 0 5.0 5.0 5.0 5.0 1 5.0 5.0 5.0 5.0 2 5.0 5.0 5.0 5.0 3 5.0 5.0 5.0 5.0 4 5.0 5.0 5.0 5.0 5 5.0 5.0 5.0 5.0 6 5.0 5.0 5.0 5.0 7 5.0 5.0 5.0 5.0 8 5.0 5.0 5.0 5.0 9 5.0 5.0 5.0 5.0 填充后检查缺失值: Sepal.Length 0 Sepal.Width 0 Petal.Length 0 Petal.width 0 Species 0 Feature_1 0 Feature_2 0 Feature_3 0 Feature_4 0 dtype: int64
[ ]: