数据分析案例
数据来源:kaggle的TMDB 5000 Movie Dataset数据集
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
# 一、准备操作
# 1. 读取文件
data_credits = pd.read_csv('tmdb_5000_credits.csv',encoding='utf-8')
data_movies = pd.read_csv('tmdb_5000_movies.csv',encoding='utf-8')
# 2. 查看表的信息
print(data_credits.info(),data_movies.info())
'''
data_credicts:
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 movie_id 4803 non-null int64
1 title 4803 non-null object
2 cast 4803 non-null object #演员列表
3 crew 4803 non-null object
data_movies:
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 budget 4803 non-null int64
1 genres 4803 non-null object #风格
2 homepage 1712 non-null object
3 id 4803 non-null int64
4 keywords 4803 non-null object
5 original_language 4803 non-null object
6 original_title 4803 non-null object
7 overview 4800 non-null object
8 popularity 4803 non-null float64
9 production_companies 4803 non-null object
10 production_countries 4803 non-null object
11 release_date 4802 non-null object
12 revenue 4803 non-null int64
13 runtime 4801 non-null float64 #电影时长
14 spoken_languages 4803 non-null object
15 status 4803 non-null object
16 tagline 3959 non-null object
17 title 4803 non-null object
18 vote_average 4803 non-null float64
19 vote_count 4803 non-null int64
'''
# 3. 表的合并
# 以credits中的movie_id和movies中的id为主键进行合并
data = pd.merge(data_credits, data_movies, left_on = ['movie_id','title'], right_on = ['id','title'], how='outer')
print(data.info())
# 二.数据分析
# *1. 电影类型如何随着时间的推移而变化? genres/release_date*
# 由info()可知release_date数据量只有4802 说明有一个空值
# 1)查找空值所在记录的电影名称 [4553 America Is Still the Place]
print(data.loc[data['release_date'].isnull(),'title'])
# 2)百度该电影上映日期 2014-06-01 将其填入空值
data['release_date'].fillna('2014-06-01',inplace=True)
# print(data.info()) # release_date 有4803行 填入成功
# 3)取出所有的电影类型:
# 在genres中 字典是以字符串的形式存放的:
# print(type(data['genres'][0])) <class 'str'>
# 因此我们需要把字符串转变为字典类型 这里使用到一个方法:json.loads
# 参考:https://www.php.cn/python-tutorials-424994.html
data['genres'] = data['genres'].transform(json.loads)
# print(type(data['genres'][0])) #<class 'list'>
# 再把列表中的每一个字典中name键对应的值取出来
type_list = []
for i in range(data['genres'].size): #对每一行数据进行遍历
for dict in data['genres'][i]: #对每一行的列表中每一个字典进行遍历
if len(dict) != 0: #当列表不为空时
type = dict['name'] #取出字典中name的值
type_list.append(type)
message = dict['name']
data.loc[i,['genres']] = ''.join(message) #把列表嵌套的字典类型转换为字符串类型
type_set = set(type_list) #通过集合去重
print(type_set)
print(data['genres'])
for j in range(data['genres'].size):
for dict2 in data['genres'][j]:
if len(dict1) != 0:
message = dict2['name']
data['genres'][j] = ''.join(message)
# 4)把上映时间的字符串类型转换为标准的日期类型
data['release_year'] = pd.to_datetime(data['release_date']).dt.year #新增一列上映年份的列
# 5)字符串操作
for j in list(type_set):
data[j] = data['genres'].str.contains(j).transform(lambda x:1 if x else 0) #某种电影是否包含该类型 并将布尔值转换为数值
# print(data.shape)
# 6)分组
groupby_year = data.groupby(by='release_year')[list(type_set)].sum()
# 7)可视化
plt.figure()
#修改字体
plt.rcParams['font.sans-serif'] = 'SimHei'
#正常显示符号问题
plt.rcParams['axes.unicode_minus'] = False
x = groupby_year.index
for type in list(type_set):
y = groupby_year[type]
print(type)
plt.plot(x, y)
plt.legend(labels=type)
plt.show()
#不同类型的电影总数占比
#不同类型的电影总数
# print(data[list(type_set)].sum().sort_values(ascending=False))
#画饼图
x1 = data[list(type_set)].sum().sort_values(ascending=False)
plt.figure()
plt.pie(x1,labels=list(type_set),autopct='%1.1f%%',pctdistance=0.5)
plt.show()
#画柱状图
plt.figure()
x2 = list(type_set)
y = data[list(type_set)].sum().sort_value()
plt.barh(x2,y)
plt.show()
*#2.电影类型与利润的关系*
#1)计算利润
data['profit'] = data['revenue'] - data['budget']
#2) 每种类型的电影的平均利润
movie_type_profit = []
for movie_type in list(type_set):
# print(data.loc[data[movie_type]==1,'profit'])
profit_avg = data.loc[data[movie_type]==1,'profit'].mean()
movie_type_profit.append(profit_avg)
#构建DataFrame表格
movie = pd.DataFrame({'movie_type':list(type_set),'movie_profit':movie_type_profit})
print(movie)
#可视化
plt.figure()
plt.barh(list(type_set),movie_type_profit)
plt.show()
# 3.Universal和paramount两家公司的对比情况
#在表中找到全称‘Universal Pictures’和‘Paramount Pictures’ 字段名为‘production companies’
#1)构建两列数据,代表两个公司
data['Universal Pictures'] = data['production_companies'].str.contains('Universal Pictures')
data['Paramount Pictures'] = data['production_companies'].str.contains('Paramount Pictures')
# print(data['Universal Pictures']) 返回布尔值
#2)对比盈利情况
U_profit = data.loc[data['Universal Pictures'],'profit'].mean()
P_profit = data.loc[data['Paramount Pictures'],'profit'].mean()
# print(U_profit,P_profit)
# 3) 发行影片的情况
U_sum = data['Universal Pictures'].sum()
P_sum = data['Paramount Pictures'].sum()
# 4)可视化
p1 = plt.figure()
x=[1,2]
labels = ['Universal Pictures' , 'Paramount Pictures']
y1=[U_profit,P_profit]
y2 = [U_sum,P_sum]
p1.add_subplot(1,2,1)
plt.xticks(x,labels)
plt.bar(x,y1)
plt.xticks(x,labels)
plt.legend('两家公司盈利情况')
p1.add_subplot(1,2,2)
plt.bar(x,y2)
plt.xticks(x,labels)
plt.legend('两家公司发行影片情况')
plt.show()