{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-01-13T01:30:08.643756Z", "start_time": "2025-01-13T01:30:06.177472Z" } }, "source": [ "import pandas as pd\n", "import tushare as ts\n", "\n", "ts_pro = ts.pro_api(token=\"64ebff4fa679167600b905ee45dd88e76f3963c0ff39157f3f085f0e\")" ], "outputs": [], "execution_count": 1 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:30:08.652416Z", "start_time": "2025-01-13T01:30:08.646084Z" } }, "cell_type": "code", "source": [ "def get_balance_sheet_df(start_year, end_year):\n", " result = ts_pro.balancesheet_vip(period=f\"{start_year}1231\")\n", " print(f\"Pull balance sheet: {start_year}\")\n", " for year in range(start_year + 1, end_year + 1):\n", " print(f\"Pull balance sheet: {year}\")\n", " period = f\"{year}1231\"\n", " temp = ts_pro.balancesheet_vip(period=period)\n", " result = pd.concat([result, temp], ignore_index=True)\n", " return result\n", "\n", "\n", "def get_income_df(start_year, end_year):\n", " result = ts_pro.income_vip(period=f\"{start_year}1231\")\n", " print(f\"Pull income: {start_year}\")\n", " for year in range(start_year + 1, end_year + 1):\n", " print(f\"Pull income: {year}\")\n", " period = f\"{year}1231\"\n", " temp = ts_pro.income_vip(period=period)\n", " result = pd.concat([result, temp], ignore_index=True)\n", " return result\n", "\n", "\n", "def get_cash_flow_df(start_year, end_year):\n", " result = ts_pro.cashflow_vip(period=f\"{start_year}1231\")\n", " print(f\"Pull cash flow: {start_year}\")\n", " for year in range(start_year + 1, end_year + 1):\n", " print(f\"Pull cash flow: {year}\")\n", " period = f\"{year}1231\"\n", " temp = ts_pro.cashflow_vip(period=period)\n", " result = pd.concat([result, temp], ignore_index=True)\n", " return result\n", "\n", "\n", "def clean_df(df):\n", " df = df.drop_duplicates(subset=[\"ts_code\", \"end_date\"])\n", " df[\"end_date\"] = df[\"end_date\"].str[:4]\n", " return df" ], "id": "14a28ff4952f0df8", "outputs": [], "execution_count": 2 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:30:08.656800Z", "start_time": "2025-01-13T01:30:08.653794Z" } }, "cell_type": "code", "source": [ "start_year = 2014\n", "end_year = 2024" ], "id": "dc68cde196159626", "outputs": [], "execution_count": 3 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:39:39.349170Z", "start_time": "2025-01-13T01:30:08.660347Z" } }, "cell_type": "code", "source": [ "# 财务负债表\n", "balance_sheet_df = clean_df(get_balance_sheet_df(start_year, end_year))\n", "balance_sheet_df.to_csv(\"../temp/balance_sheet.csv\", index=False)" ], "id": "33cd797a12ad567e", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pull balance sheet: 2014\n", "Pull balance sheet: 2015\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7h/w0cmp4zj6mn9br_6nyj310m40000gn/T/ipykernel_50121/709533518.py:8: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " result = pd.concat([result, temp], ignore_index=True)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Pull balance sheet: 2016\n", "Pull balance sheet: 2017\n", "Pull balance sheet: 2018\n", "Pull balance sheet: 2019\n", "Pull balance sheet: 2020\n", "Pull balance sheet: 2021\n", "Pull balance sheet: 2022\n", "Pull balance sheet: 2023\n", "Pull balance sheet: 2024\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7h/w0cmp4zj6mn9br_6nyj310m40000gn/T/ipykernel_50121/709533518.py:8: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " result = pd.concat([result, temp], ignore_index=True)\n" ] } ], "execution_count": 4 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:43:28.007135Z", "start_time": "2025-01-13T01:39:39.363037Z" } }, "cell_type": "code", "source": [ "income_df = clean_df(get_income_df(start_year, end_year))\n", "income_df.to_csv(\"../temp/income.csv\", index=False)" ], "id": "17306c1524f5e173", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pull income: 2014\n", "Pull income: 2015\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7h/w0cmp4zj6mn9br_6nyj310m40000gn/T/ipykernel_50121/709533518.py:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " result = pd.concat([result, temp], ignore_index=True)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Pull income: 2016\n", "Pull income: 2017\n", "Pull income: 2018\n", "Pull income: 2019\n", "Pull income: 2020\n", "Pull income: 2021\n", "Pull income: 2022\n", "Pull income: 2023\n", "Pull income: 2024\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7h/w0cmp4zj6mn9br_6nyj310m40000gn/T/ipykernel_50121/709533518.py:19: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " result = pd.concat([result, temp], ignore_index=True)\n" ] } ], "execution_count": 5 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:46:46.375157Z", "start_time": "2025-01-13T01:43:28.011213Z" } }, "cell_type": "code", "source": [ "cash_flow_df = clean_df(get_cash_flow_df(start_year, end_year))\n", "cash_flow_df.to_csv(\"../temp/cash_flow.csv\", index=False)" ], "id": "334dbe20f2047a1e", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pull cash flow: 2014\n", "Pull cash flow: 2015\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7h/w0cmp4zj6mn9br_6nyj310m40000gn/T/ipykernel_50121/709533518.py:30: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " result = pd.concat([result, temp], ignore_index=True)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Pull cash flow: 2016\n", "Pull cash flow: 2017\n", "Pull cash flow: 2018\n", "Pull cash flow: 2019\n", "Pull cash flow: 2020\n", "Pull cash flow: 2021\n", "Pull cash flow: 2022\n", "Pull cash flow: 2023\n", "Pull cash flow: 2024\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/7h/w0cmp4zj6mn9br_6nyj310m40000gn/T/ipykernel_50121/709533518.py:30: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", " result = pd.concat([result, temp], ignore_index=True)\n" ] } ], "execution_count": 6 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:46:57.717599Z", "start_time": "2025-01-13T01:46:46.377806Z" } }, "cell_type": "code", "source": [ "finance_df = pd.merge(balance_sheet_df, income_df, on=[\"ts_code\", \"end_date\"])\n", "finance_df = pd.merge(finance_df, cash_flow_df, on=[\"ts_code\", \"end_date\"])\n", "finance_df.to_csv(\"../temp/finance.csv\", index=False)" ], "id": "f8bea62f377b5e2", "outputs": [], "execution_count": 7 }, { "metadata": { "ExecuteTime": { "end_time": "2025-01-13T01:46:57.814028Z", "start_time": "2025-01-13T01:46:57.728111Z" } }, "cell_type": "code", "source": "finance_df", "id": "b14b477ca3c0f720", "outputs": [ { "data": { "text/plain": [ " ts_code ann_date_x f_ann_date_x end_date report_type_x comp_type_x \\\n", "0 830964.BJ 20180103 20180103 2014 1 1 \n", "1 834765.BJ 20180105 20180105 2014 1 1 \n", "2 835174.BJ 20180130 20180130 2014 1 1 \n", "3 301076.SZ 20180117 20180117 2014 1 1 \n", "4 601528.SH 20180116 20180116 2014 1 2 \n", "... ... ... ... ... ... ... \n", "47434 603260.SH 20240430 20240430 2023 1 1 \n", "47435 603828.SH 20240430 20240430 2023 1 1 \n", "47436 002120.SZ 20240430 20240430 2023 1 1 \n", "47437 000790.SZ 20240430 20240430 2023 1 1 \n", "47438 000504.SZ 20240430 20240430 2023 1 1 \n", "\n", " end_type_x total_share cap_rese undistr_porfit ... \\\n", "0 4 6.800000e+07 1.949319e+07 5.757873e+06 ... \n", "1 4 1.500000e+07 2.169516e+07 9.042014e+06 ... \n", "2 4 5.714286e+07 NaN -7.873967e+06 ... \n", "3 4 5.000000e+07 NaN 1.800787e+07 ... \n", "4 4 1.197900e+09 3.048310e+08 1.363431e+09 ... \n", "... ... ... ... ... ... \n", "47434 4 1.182207e+09 1.138407e+10 1.921573e+10 ... \n", "47435 4 5.959602e+08 2.395205e+08 -1.640928e+08 ... \n", "47436 4 2.899193e+09 2.810602e+09 1.234578e+10 ... \n", "47437 4 6.281426e+08 7.375118e+07 2.299767e+08 ... \n", "47438 4 3.115739e+08 4.234729e+08 -5.311005e+08 ... \n", "\n", " net_dism_capital_add net_cash_rece_sec credit_impa_loss \\\n", "0 NaN NaN NaN \n", "1 NaN NaN NaN \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "... ... ... ... \n", "47434 NaN NaN NaN \n", "47435 NaN NaN 3.794784e+07 \n", "47436 NaN NaN 1.401759e+08 \n", "47437 NaN NaN NaN \n", "47438 NaN NaN NaN \n", "\n", " use_right_asset_dep oth_loss_asset end_bal_cash beg_bal_cash \\\n", "0 NaN None NaN NaN \n", "1 NaN None NaN NaN \n", "2 NaN None NaN NaN \n", "3 NaN None NaN NaN \n", "4 NaN None NaN NaN \n", "... ... ... ... ... \n", "47434 7.167836e+06 None 1.034290e+09 1.822382e+09 \n", "47435 4.186533e+06 None 1.255399e+08 1.020296e+08 \n", "47436 5.239484e+08 None 4.313758e+09 3.507770e+09 \n", "47437 2.214418e+06 None 1.164838e+08 3.787762e+07 \n", "47438 3.289952e+06 None 4.937811e+08 2.181183e+08 \n", "\n", " end_bal_cash_equ beg_bal_cash_equ update_flag \n", "0 NaN NaN 0 \n", "1 NaN NaN 0 \n", "2 NaN NaN 0 \n", "3 NaN NaN 1 \n", "4 NaN NaN 1 \n", "... ... ... ... \n", "47434 NaN NaN 1 \n", "47435 NaN NaN 0 \n", "47436 NaN NaN 1 \n", "47437 NaN NaN 0 \n", "47438 NaN NaN 0 \n", "\n", "[47439 rows x 329 columns]" ], "text/html": [ "
| \n", " | ts_code | \n", "ann_date_x | \n", "f_ann_date_x | \n", "end_date | \n", "report_type_x | \n", "comp_type_x | \n", "end_type_x | \n", "total_share | \n", "cap_rese | \n", "undistr_porfit | \n", "... | \n", "net_dism_capital_add | \n", "net_cash_rece_sec | \n", "credit_impa_loss | \n", "use_right_asset_dep | \n", "oth_loss_asset | \n", "end_bal_cash | \n", "beg_bal_cash | \n", "end_bal_cash_equ | \n", "beg_bal_cash_equ | \n", "update_flag | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "830964.BJ | \n", "20180103 | \n", "20180103 | \n", "2014 | \n", "1 | \n", "1 | \n", "4 | \n", "6.800000e+07 | \n", "1.949319e+07 | \n", "5.757873e+06 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "
| 1 | \n", "834765.BJ | \n", "20180105 | \n", "20180105 | \n", "2014 | \n", "1 | \n", "1 | \n", "4 | \n", "1.500000e+07 | \n", "2.169516e+07 | \n", "9.042014e+06 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "
| 2 | \n", "835174.BJ | \n", "20180130 | \n", "20180130 | \n", "2014 | \n", "1 | \n", "1 | \n", "4 | \n", "5.714286e+07 | \n", "NaN | \n", "-7.873967e+06 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "0 | \n", "
| 3 | \n", "301076.SZ | \n", "20180117 | \n", "20180117 | \n", "2014 | \n", "1 | \n", "1 | \n", "4 | \n", "5.000000e+07 | \n", "NaN | \n", "1.800787e+07 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1 | \n", "
| 4 | \n", "601528.SH | \n", "20180116 | \n", "20180116 | \n", "2014 | \n", "1 | \n", "2 | \n", "4 | \n", "1.197900e+09 | \n", "3.048310e+08 | \n", "1.363431e+09 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "None | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 47434 | \n", "603260.SH | \n", "20240430 | \n", "20240430 | \n", "2023 | \n", "1 | \n", "1 | \n", "4 | \n", "1.182207e+09 | \n", "1.138407e+10 | \n", "1.921573e+10 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "7.167836e+06 | \n", "None | \n", "1.034290e+09 | \n", "1.822382e+09 | \n", "NaN | \n", "NaN | \n", "1 | \n", "
| 47435 | \n", "603828.SH | \n", "20240430 | \n", "20240430 | \n", "2023 | \n", "1 | \n", "1 | \n", "4 | \n", "5.959602e+08 | \n", "2.395205e+08 | \n", "-1.640928e+08 | \n", "... | \n", "NaN | \n", "NaN | \n", "3.794784e+07 | \n", "4.186533e+06 | \n", "None | \n", "1.255399e+08 | \n", "1.020296e+08 | \n", "NaN | \n", "NaN | \n", "0 | \n", "
| 47436 | \n", "002120.SZ | \n", "20240430 | \n", "20240430 | \n", "2023 | \n", "1 | \n", "1 | \n", "4 | \n", "2.899193e+09 | \n", "2.810602e+09 | \n", "1.234578e+10 | \n", "... | \n", "NaN | \n", "NaN | \n", "1.401759e+08 | \n", "5.239484e+08 | \n", "None | \n", "4.313758e+09 | \n", "3.507770e+09 | \n", "NaN | \n", "NaN | \n", "1 | \n", "
| 47437 | \n", "000790.SZ | \n", "20240430 | \n", "20240430 | \n", "2023 | \n", "1 | \n", "1 | \n", "4 | \n", "6.281426e+08 | \n", "7.375118e+07 | \n", "2.299767e+08 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "2.214418e+06 | \n", "None | \n", "1.164838e+08 | \n", "3.787762e+07 | \n", "NaN | \n", "NaN | \n", "0 | \n", "
| 47438 | \n", "000504.SZ | \n", "20240430 | \n", "20240430 | \n", "2023 | \n", "1 | \n", "1 | \n", "4 | \n", "3.115739e+08 | \n", "4.234729e+08 | \n", "-5.311005e+08 | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "3.289952e+06 | \n", "None | \n", "4.937811e+08 | \n", "2.181183e+08 | \n", "NaN | \n", "NaN | \n", "0 | \n", "
47439 rows × 329 columns
\n", "