stock.csv
movie_data_cleaned.csv
'''file: pandas_def.py'''import pandas as pdimport numpy as nppd.set_option('display.max_columns', None)pd.set_option('display.max_rows', None)file = "movie_data_cleaned.csv"def movie_year_amount_tj():data = pd.read_csv(file)data['release_date'] = pd.to_datetime(data['release_date'])data = data.set_index(data['release_date'])data_year_tj = data['release_date'].resample('Y').count()return pd.DataFrame(data_year_tj)def country_year_tj():data = pd.read_csv(file,usecols=['title', 'country', 'language', 'release_date', 'average'])data = data[['title', 'country', 'language', 'release_date', 'average']]# 各国每年的电影产量data['country'] = data['country'].str.strip(' ')data['country'] = data['country'].fillna(value='')country_list = []for c in data['country']:c_list = c.split(' / ')for label in c_list: # 123,2,3 -> 1,2,3country_list.append(label)country_list = list(set(country_list))country_list.remove('')country_list.remove('美国/澳大利亚')country_list.remove('捷克斯洛伐克/捷克')country_list.remove('中国大陆') # 只统计中国country_list.remove('中国香港')country_list.remove('中国台湾')data['release_date'] = pd.to_datetime(data['release_date'])data = data.set_index(data['release_date'])count = 0tj = pd.DataFrame(data['release_date'].resample('Y').count())tj = tj.drop(columns='release_date')for label in country_list:temp = data[data['country'].str.contains(label)]print("=====================================")print("标签=", label)print("总频数=", len(temp))count += len(temp)tj[label] = temp['release_date'].resample('Y').count()tj = tj.fillna(value=0)return tjdef language_tj():data = pd.read_csv(file,usecols=['title', 'country', 'language', 'release_date', 'average'])data = data[['title', 'country', 'language', 'release_date', 'average']]# label统计 -> listdata['language'] = data['language'].str.strip(' ')data['language'] = data['language'].fillna(value='')lang_list = []for l in data['language']:l_list = l.split(' / ')for label in l_list: # 123,2,3 -> 1,2,3lang_list.append(label)lang_list = list(set(lang_list))lang_list.remove('')lang_list.remove('汉语普通话')# 统计每个类型标签对应的电影量/条数/频数data_lang_tj = pd.DataFrame(np.zeros([len(lang_list), 1]),index=lang_list, columns=['tj']) # 2列:标签,统计值tjfor i in data['language']:for label in lang_list:if str(i).__contains__(label):data_lang_tj.loc[label, 'tj'] += 1# 将小类汇总为大类,并添加至统计dfchinese_fy = data_lang_tj.loc['湖南话', 'tj'] + data_lang_tj.loc['北京话', 'tj']# print(chinese_fy)data_lang_tj.loc['中国方言', 'tj'] = chinese_fyreturn data_lang_tjdef averge_votes():return pd.read_csv(file, usecols=['average', 'votes', 'title'])def genre_tj():# 读取数据data = pd.read_csv(file)# 获取所有类型:提取每一行的genre元素 -> 新的列表 genre_list -> 去重data['genre'] = data['genre'].str.strip('[')data['genre'] = data['genre'].str.strip(']')data['genre'] = data['genre'].fillna(value='')genre_list = []for g in data['genre']:g_list = g.split(', ')for label in g_list: # 123,2,3 -> 1,2,3genre_list.append(label)g_list = list(set(genre_list))g_list.remove('')# 统计每个类型标签对应的电影量/条数/频数data_genre_tj = pd.DataFrame(np.zeros([len(g_list), 1]),index=g_list, columns=['tj']) # 2列:标签,统计值tjfor i in data['genre']:for label in g_list:if str(i).__contains__(label):data_genre_tj.loc[label, 'tj'] += 1return data_genre_tjdef genre_rates_tj(x):''':param x: 前X为:return: 排名前X位的电影类型标签,对应的评分均值数据'''# 电影类型(x):6个标签,电影数量最多的# 评分数据(y):2,3,4,5,6,7,8,9,10# 电影数量(值)genre_list = genre_tj().sort_values('tj', ascending=False) \.head(x).index.tolist()rate_list = [2, 3, 4, 5, 6, 7, 8, 9, 10]data = pd.read_csv(file, usecols=['genre', 'average'])# 统计满足类型与评分区间的数据data_genre_tj = pd.DataFrame(np.zeros([len(rate_list), len(genre_list)]),index=rate_list, columns=genre_list)for g in genre_list: # 循环遍历6个变迁元素for rate in rate_list: # 循环遍历2-10评分数值for i, r in zip(data['genre'], data['average']):if str(i).__contains__(g) and rate <= r < rate + 1:data_genre_tj.loc[rate, g] += 1return data_genre_tjdef rate_tj_by_year(year_list):data = pd.read_csv(file,usecols=['title', 'average', 'release_date'])data = data.set_index(pd.to_datetime(data['release_date']))tj = []for year in year_list:tj.append(data[year]['average'].tolist())return tjif __name__ == '__main__':# print(movie_year_amount_tj())# print(genre_rates_tj(6))print(rate_tj_by_year(['2015', '2016']))
