market_assistant/analyze_acv_dist.py

70 lines
3.1 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
from typing import List
from openpyxl import Workbook
def strip_character(column_name, characters: List[str]):
new_col_name = column_name
for character in characters:
new_col_name = new_col_name.replace(character, '')
new_col_name = new_col_name.strip()
return new_col_name
def refine_content(df):
strip_character_list = [' ', '\n', ':', '','其他']
for col in df.columns:
df[col] = df[col].apply(lambda x: "其他" if strip_character(x, strip_character_list) == "" else strip_character(x, strip_character_list))
return df
def get_acv_distribution(df, acv_name, industry_col_name):
# Define the bins for ACV intervals
bins = [0, 1e6, 5e6, float('inf')]
labels = ['<100万', '100万-500万', '>500万']
# Create a new column 'ACV Interval' based on the bins
df['ACV Interval'] = pd.cut(df[acv_name], bins=bins, labels=labels, right=False)
industry_acv_distribution = df.groupby([industry_col_name, 'ACV Interval']).size().unstack(fill_value=0)
industry_acv_distribution.loc['Total'] = industry_acv_distribution.sum()
return industry_acv_distribution
def get_acv_distribution_sum(df, acv_name, industry_col_name):
# Define the bins for ACV intervals
bins = [0, 1e6, 5e6, float('inf')]
labels = ['<100万', '100万-500万', '>500万']
# Create a new column 'ACV Interval' based on the bins
df['ACV Interval'] = pd.cut(df[acv_name], bins=bins, labels=labels, right=False)
# Group by industry and ACV interval, then sum the ACV values
industry_acv_sum = df.groupby([industry_col_name, 'ACV Interval'])[acv_name].sum().unstack(fill_value=0)
# Add a 'Total' row
industry_acv_sum.loc['Total'] = industry_acv_sum.sum()
return industry_acv_sum
# 新增函数将结果保存到Excel
def save_to_excel(dfs, sheet_names, output_file):
with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
for df, sheet_name in zip(dfs, sheet_names):
df.to_excel(writer, sheet_name=sheet_name)
# 读取数据
df = pd.read_excel('./data_src/pingcap_won.xlsx')
df_pipeline = pd.read_excel('./data_src/pingcap_pipeline.xlsx')
# 获取各种分布
won_industry_dist = get_acv_distribution(df, 'ACV', '客户分类')
won_sub_industry_dist = get_acv_distribution(df, 'ACV', '客户行业')
pipeline_industry_dist = get_acv_distribution(df_pipeline, '预估 ACV', '负责人所属行业')
pipeline_sub_industry_dist = get_acv_distribution(df_pipeline, '预估 ACV', '客户行业')
pipeline_sub_industry_dist_sum = get_acv_distribution_sum(df_pipeline, '预估 ACV', '客户行业')
pipeline_sub_industry_dist_sum.to_excel('./output/pipeline_sub_industry_dist_sum.xlsx')
# 保存结果到Excel
dfs = [won_industry_dist, won_sub_industry_dist, pipeline_industry_dist, pipeline_sub_industry_dist]
sheet_names = ['成单-行业分布', '成单-子行业分布', 'Pipeline-行业分布', 'Pipeline-子行业分布']
# save_to_excel(dfs, sheet_names, './output/acv_distribution.xlsx')
print("ACV distribution analysis completed. Results saved in './output/acv_distribution.xlsx'")