commit
0f18a40b56
@@ -0,0 +1,168 @@
_data/

*.pkl
*.csv
*.swp
*.swo

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,7 @@
from collections import namedtuple

TINYSOFT_DATA_PATH = '../_data/tinysoft-data/'

DateRange = namedtuple('DateRange', ['start_date', 'end_date'])
@@ -0,0 +1,180 @@
#!/usr/bin/env python
# coding: utf-8

# # Import into SQL Server

# In[77]:


import pandas as pd
import os
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.types import CHAR, INT, VARCHAR, FLOAT, TEXT
import time

filelist = os.listdir('D:/数据/天软基本面数据/46.合并利润分配表')
engine = create_engine('mssql+pymssql://sa:admin@192.168.1.128/test')
conn = engine.connect()
for i in range(2451, 2452):
    file = filelist[i]
    data = pd.read_csv('D:/数据/天软基本面数据/46.合并利润分配表/' + file)
    data['备注'] = data['备注'].astype(str)  # force the remark (备注) column to str
    # data['主营业务利润'] = data['主营业务利润'].astype(np.float64)
    # data['预警详情'] = data['预警详情'].astype(str)
    title_list = pd.read_excel('D:/数据/天软基本面数据/wind-天软字段匹配/天软中英文字段对照表.xlsx',
                               '合并利润分配表')
    eng_name = title_list['英文名']
    dtype = title_list['数据类型']
    data.columns = eng_name  # replace the column names with the English field names
    dtype_dict = {'INT': INT(),
                  'CHAR(8)': CHAR(8),
                  'VARCHAR(20)': VARCHAR(20),
                  'TEXT': TEXT(),
                  'FLOAT': FLOAT()}  # map source-table dtypes to SQL Server dtypes
    dtype_list = dtype.map(dtype_dict)
    dtype_list = pd.concat([eng_name, dtype_list], axis=1)
    dtype_list = dtype_list.set_index('英文名').to_dict()['数据类型']
    data.to_sql('CONSOLIDATED_INCOMESTATEMENT',
                conn,
                index=False,
                if_exists='append',
                dtype=dtype_list)
    time.sleep(0.1)
conn.close()
engine.dispose()


# # Alter column types

# In[ ]:


import pymssql
conn = pymssql.connect('192.168.1.128', 'sa', 'admin', "test")  # connect to the "test" database
cursor = conn.cursor()
sqls = ['ALTER TABLE EARNINGS_PREANNOUNCEMENT ALTER COLUMN StockID CHAR(8) NOT NULL',
        'ALTER TABLE EARNINGS_PREANNOUNCEMENT ALTER COLUMN S_PROFITNOTICE_PERIOD INT NOT NULL',
        # 'ALTER TABLE PRELIMINARY_EARNING_EST ALTER COLUMN REPORT_DATE INT NOT NULL',
        'ALTER TABLE EARNINGS_PREANNOUNCEMENT ALTER COLUMN S_PROFITNOTICE_DATE INT NOT NULL',
        '''ALTER TABLE EARNINGS_PREANNOUNCEMENT ADD CONSTRAINT STOCK_EP_ID PRIMARY KEY
        (StockID,S_PROFITNOTICE_PERIOD,S_PROFITNOTICE_DATE)''',
        'CREATE NONCLUSTERED INDEX STOCK_INTID ON EARNINGS_PREANNOUNCEMENT (StockID_INT)']
for sql in sqls:
    cursor.execute(sql)  # apply each DDL statement
conn.commit()
conn.close()


# In[60]:


conn = pymssql.connect('192.168.1.128', 'sa', 'admin', "test")
cursor = conn.cursor()
sql1 = '''
sp_rename 'DIVIDEND_ANNOUNCEMENT.REPORT_PERIOD', 'ReportPeriod', 'column'
'''
# sql2 = '''
# sp_rename 'DIVIDEND_ANNOUNCEMENT.S_PROFITNOTICE_DATE', 'AppearAtDate', 'column'
# '''
sql3 = '''
drop index STOCK_INTID on DIVIDEND_ANNOUNCEMENT
'''
sql4 = '''
alter table DIVIDEND_ANNOUNCEMENT drop column StockID_INT
'''
cursor.execute(sql1)
# cursor.execute(sql2)
cursor.execute(sql3)
cursor.execute(sql4)
conn.commit()
conn.close()


# # Create views

# In[184]:


conn = pymssql.connect('192.168.1.128', 'sa', 'admin', "test")
cursor = conn.cursor()
# sql = '''
# CREATE VIEW LAYER1 AS
# SELECT A.StockID, A.REPORT_PERIOD, A.ACTUAL_ANN_DT FROM
# (
# SELECT *, ROW_NUMBER() OVER(PARTITION BY StockID, REPORT_PERIOD order by StockID, REPORT_PERIOD)
# AS RowNumber FROM FINCOMP_CASHFLOWSTATEMENT
# )A
# WHERE A.RowNumber=1
# '''
sql = '''
CREATE VIEW DIVMeta AS
SELECT StockID,
RIGHT(StockID,6) + '.' + LEFT(StockID,2) AS WIND_CODE,
CONVERT(INT, RIGHT(StockID, 6)) AS IntCode,
ReportPeriod FROM DIVIDEND_ANNOUNCEMENT
'''
cursor.execute(sql)
conn.commit()
conn.close()


# In[185]:


import re
wind_index = pd.read_excel('D:/数据/天软基本面数据/wind-天软字段匹配/已匹配索引总表.xlsx',
                           '分红送股')
view_column = list(wind_index['字段名'])  # Wind English field names (used after AS)
view_joinindex = list(wind_index['天软字段'])  # TinySoft fields (Chinese names or computed expressions) matching Wind
ts_index = pd.read_excel('D:/数据/天软基本面数据/wind-天软字段匹配/天软中英文字段对照表.xlsx',
                         '分红送股')
sql_matching = []
for i in range(len(view_column)):
    if 'exp' in view_joinindex[i]:
        expression = view_joinindex[i]
        ts_columns = re.split(r'\$| ', expression)
        ts_columns = [col for col in ts_columns if u'\u4e00' <= col <= u'\u9fff']  # keep only the Chinese field names
        for col in ts_columns:
            expression = expression.replace(col, 'B.' + ts_index.loc[ts_index['天软字段'] == col, '英文名'].values[0])
        expression = expression.replace('exp:', '')
        expression = expression.replace('$', '')
        sql_matching.append(expression)
        sql_matching.append(' AS ' + view_column[i] + ',')
    else:
        sql_matching.append('B.' + ts_index.loc[ts_index['天软字段'] == view_joinindex[i], '英文名'].values[0] +
                            ' AS ' + view_column[i] + ',')
sql_matching[-1] = sql_matching[-1].replace(',', '')
sql_matching = ''.join(sql_matching)


# In[186]:


conn = pymssql.connect('192.168.1.128', 'sa', 'admin', "test")
cursor = conn.cursor()
sql = 'CREATE VIEW DIVWind AS ' + 'SELECT A.WIND_CODE, A.IntCode, A.ReportPeriod, ' + sql_matching + ' FROM DIVMeta A LEFT JOIN DIVIDEND_ANNOUNCEMENT B ON ' + 'A.StockID = B.StockID AND A.ReportPeriod = B.ReportPeriod'
cursor.execute(sql)
conn.commit()
conn.close()


# # Drop views

# In[134]:


import pymssql
conn = pymssql.connect('192.168.1.128', 'sa', 'admin', "test")
cursor = conn.cursor()
# sql = '''
# SELECT RIGHT(StockID,6) + '.' + LEFT(StockID,2) FROM CONSOLIDATED_BALANCESHEET
# '''
sql = '''
IF EXISTS(SELECT * FROM SYS.VIEWS WHERE NAME='CBSBeforeAdj')
DROP VIEW CBSBeforeAdj
'''
cursor.execute(sql)
conn.commit()
conn.close()
@@ -0,0 +1,135 @@
import os, time
from tqdm import tqdm
import itertools
from pathlib import Path

from collections import namedtuple

import numpy as np
import pandas as pd

from tsl import *
from config import *


class DLFinancial:

    def __init__(self):

        self.table_id_to_table_name = {
            18 : '分红送股',
            40 : '业绩预测',
            41 : '业绩快报',
            42 : '主要财务指标',
            44 : '合并资产负债表',
            46 : '合并利润分配表',
            48 : '合并现金流量表',
            56 : '金融公司资产负债表',
            58 : '金融公司利润分配表',
            60 : '金融公司现金流量表',
        }

        self.table_id_to_index_list = {
            18 : ['StockID', 'StockName', '截止日'],
            40 : ['StockID', 'StockName', '截止日', '公布日'],
            41 : ['StockID', 'StockName', '截止日', '公布日'],
            42 : ['StockID', 'StockName', '截止日', '公布日'],
            44 : ['StockID', 'StockName', '截止日', '数据报告期', '公布日'],
            46 : ['StockID', 'StockName', '截止日', '数据报告期', '公布日'],
            48 : ['StockID', 'StockName', '截止日', '数据报告期', '公布日'],
            56 : ['StockID', 'StockName', '截止日', '数据报告期', '公布日'],
            58 : ['StockID', 'StockName', '截止日', '数据报告期', '公布日'],
            60 : ['StockID', 'StockName', '截止日', '数据报告期', '公布日'],
        }

        self.config_name_list = [
            '每股指标',
            '盈利能力',
            '偿债能力',
            '资本结构',
            '经营能力',
            '投资收益',
            '成长能力',
            '现金流指标',
            '资产负债表结构',
            '利润分配表结构',
            '现金流量表结构',
            '估值指标',
        ]


    def do_fin_report(self):
        for table_id in self.table_id_to_table_name.keys():
            self._dump_fin_report(table_id, 20000101)


    def _dump_fin_report(self, table_id, report_start_date):

        table_name = self.table_id_to_table_name[table_id]
        dump_folder = '{}/基础报表/{}.{}/'.format(TINYSOFT_DATA_PATH, table_id, table_name)
        Path(dump_folder).mkdir(parents=True, exist_ok=True)

        def _dump_df_to_csv(dump_folder, stock_id, df):
            dump_path = '{}/{}.csv'.format(dump_folder, stock_id)

            index_cols = self.table_id_to_index_list[table_id]
            df.set_index(index_cols, inplace=True)
            df.to_csv(dump_path)

        with tsl() as ts:
            stock_list = ts.get_stock_list()
            print('正在获取数据:', table_id, table_name)

            with tqdm(stock_list) as pbar:
                for stock_id in pbar:
                    pbar.set_description(dump_folder + stock_id)

                    df = ts.get_cmp_report(
                        table_id=table_id,
                        stock_id=stock_id,
                        start_date=report_start_date
                    )

                    if df.shape[0] == 0 or df.shape[1] == 0:
                        print('{}的{}为空数据'.format(stock_id, table_name))
                        continue

                    _dump_df_to_csv(dump_folder, stock_id, df)
                    time.sleep(0.001)


    def _dump_common_info(self):
        pass


    def _dump_derived_indicators(self, report_start_year):

        def _dump_df_to_csv(config_name, stock_id, df):
            dump_path = '{}/衍生指标/{}/{}.csv'.format(
                TINYSOFT_DATA_PATH, config_name, stock_id)
            df.to_csv(dump_path)

        with tsl() as ts:
            stock_list = ts.get_stock_list()

            for config_name in self.config_name_list:
                print('正在获取数据:', config_name)

                with tqdm(stock_list) as pbar:
                    for stock_id in pbar:
                        pbar.set_description(stock_id)

                        df = ts.get_cmp_indicator(
                            stock_id=stock_id,
                            start_year=report_start_year,
                            indicator_config_fname='{}/indicator-config/{}.csv'.format(
                                TINYSOFT_DATA_PATH,
                                config_name
                            )
                        )
                        _dump_df_to_csv(config_name, stock_id, df)
                        time.sleep(0.001)


if __name__ == '__main__':
    DLFinancial().do_fin_report()
@@ -0,0 +1,61 @@
import os.path

import pandas as pd
from tqdm import tqdm

from tsl import *
from market import *
from config import *


def merge_k_daily():

    shard_dump_folder = '{}/行情数据/日K线/shards/'.format(TINYSOFT_DATA_PATH)
    merged_dump_folder = '{}/行情数据/日K线/merged/'.format(TINYSOFT_DATA_PATH)

    with tsl() as t:
        stock_list = t.get_stock_list()

        with tqdm(stock_list) as pbar:

            for stock_id in pbar:
                pbar.set_description('正在处理 ' + stock_id)

                df_list = []
                for date_range in Market.k_daily_data_shards:

                    start_date, end_date = \
                        date_range[0], date_range[1]
                    shard_name = str(start_date) + '-' + str(end_date)

                    load_path = '{}/{}/{}.csv'.format(
                        shard_dump_folder,
                        shard_name,
                        stock_id
                    )

                    pbar.set_description('正在载入 ' + load_path)
                    if not os.path.exists(load_path):
                        continue

                    _df = pd.read_csv(load_path)

                    if len(_df) > 0:
                        df_list.append(_df)

                pbar.set_description('正在拼接 ' + stock_id)
                if len(df_list) > 0:
                    df = pd.concat(df_list, axis=0)
                    df.set_index(['StockID', 'date'], inplace=True)

                    dump_path = '{}/{}.csv'.format(
                        merged_dump_folder,
                        stock_id
                    )
                    df.to_csv(dump_path)


if __name__ == '__main__':
    merge_k_daily()
@@ -0,0 +1,69 @@
- [Download Qlib Data](#Download-Qlib-Data)
  - [Download CN Data](#Download-CN-Data)
  - [Download US Data](#Download-US-Data)
  - [Download CN Simple Data](#Download-CN-Simple-Data)
  - [Help](#Help)
- [Using in Qlib](#Using-in-Qlib)
  - [US data](#US-data)
  - [CN data](#CN-data)


## Download Qlib Data


### Download CN Data

```bash
# daily data
python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn

# 1min data (Optional for running non-high-frequency strategies)
python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
```

### Download US Data

```bash
python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/us_data --region us
```

### Download CN Simple Data

```bash
python get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --region cn
```

### Help

```bash
python get_data.py qlib_data --help
```

## Using in Qlib
> For more information: https://qlib.readthedocs.io/en/latest/start/initialization.html


### US data

> Need to download data first: [Download US Data](#Download-US-Data)

```python
import qlib
from qlib.config import REG_US
provider_uri = "~/.qlib/qlib_data/us_data"  # target_dir
qlib.init(provider_uri=provider_uri, region=REG_US)
```

### CN data

> Need to download data first: [Download CN Data](#Download-CN-Data)

```python
import qlib
from qlib.constant import REG_CN

provider_uri = "~/.qlib/qlib_data/cn_data"  # target_dir
qlib.init(provider_uri=provider_uri, region=REG_CN)
```
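
Once `qlib.init` has been pointed at a downloaded data directory, the data can be read back through `qlib.data.D`. A minimal sketch (the ticker and date range below are placeholders, not part of the original README):

```python
from qlib.data import D

# daily close/volume for one instrument from the initialized provider_uri
df = D.features(["SH600000"], ["$close", "$volume"],
                start_time="2020-01-01", end_time="2020-01-31", freq="day")
print(df.head())
```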
@@ -0,0 +1,143 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

import qlib
from qlib.data import D

import fire
import datacompy
import pandas as pd
from tqdm import tqdm
from loguru import logger


class CheckBin:

    NOT_IN_FEATURES = "not in features"
    COMPARE_FALSE = "compare False"
    COMPARE_TRUE = "compare True"
    COMPARE_ERROR = "compare error"

    def __init__(
        self,
        qlib_dir: str,
        csv_path: str,
        check_fields: str = None,
        freq: str = "day",
        symbol_field_name: str = "symbol",
        date_field_name: str = "date",
        file_suffix: str = ".csv",
        max_workers: int = 16,
    ):
        """

        Parameters
        ----------
        qlib_dir : str
            qlib dir
        csv_path : str
            origin csv path
        check_fields : str, optional
            check fields, by default None, check qlib_dir/features/<first_dir>/*.<freq>.bin
        freq : str, optional
            freq, value from ["day", "1m"]
        symbol_field_name: str, optional
            symbol field name, by default "symbol"
        date_field_name: str, optional
            date field name, by default "date"
        file_suffix: str, optional
            csv file suffix, by default ".csv"
        max_workers: int, optional
            max workers, by default 16
        """
        self.qlib_dir = Path(qlib_dir).expanduser()
        bin_path_list = list(self.qlib_dir.joinpath("features").iterdir())
        self.qlib_symbols = sorted(map(lambda x: x.name.lower(), bin_path_list))
        qlib.init(
            provider_uri=str(self.qlib_dir.resolve()),
            mount_path=str(self.qlib_dir.resolve()),
            auto_mount=False,
            redis_port=-1,
        )
        csv_path = Path(csv_path).expanduser()
        self.csv_files = sorted(csv_path.glob(f"*{file_suffix}") if csv_path.is_dir() else [csv_path])

        if check_fields is None:
            check_fields = list(map(lambda x: x.name.split(".")[0], bin_path_list[0].glob(f"*.bin")))
        else:
            check_fields = check_fields.split(",") if isinstance(check_fields, str) else check_fields
        self.check_fields = list(map(lambda x: x.strip(), check_fields))
        self.qlib_fields = list(map(lambda x: f"${x}", self.check_fields))
        self.max_workers = max_workers
        self.symbol_field_name = symbol_field_name
        self.date_field_name = date_field_name
        self.freq = freq
        self.file_suffix = file_suffix

    def _compare(self, file_path: Path):
        symbol = file_path.name.strip(self.file_suffix)
        if symbol.lower() not in self.qlib_symbols:
            return self.NOT_IN_FEATURES
        # qlib data
        qlib_df = D.features([symbol], self.qlib_fields, freq=self.freq)
        qlib_df.rename(columns={_c: _c.strip("$") for _c in qlib_df.columns}, inplace=True)
        # csv data
        origin_df = pd.read_csv(file_path)
        origin_df[self.date_field_name] = pd.to_datetime(origin_df[self.date_field_name])
        if self.symbol_field_name not in origin_df.columns:
            origin_df[self.symbol_field_name] = symbol
        origin_df.set_index([self.symbol_field_name, self.date_field_name], inplace=True)
        origin_df.index.names = qlib_df.index.names
        origin_df = origin_df.reindex(qlib_df.index)
        try:
            compare = datacompy.Compare(
                origin_df,
                qlib_df,
                on_index=True,
                abs_tol=1e-08,  # Optional, defaults to 0
                rel_tol=1e-05,  # Optional, defaults to 0
                df1_name="Original",  # Optional, defaults to 'df1'
                df2_name="New",  # Optional, defaults to 'df2'
            )
            _r = compare.matches(ignore_extra_columns=True)
            return self.COMPARE_TRUE if _r else self.COMPARE_FALSE
        except Exception as e:
            logger.warning(f"{symbol} compare error: {e}")
            return self.COMPARE_ERROR

    def check(self):
        """Check whether the bin file after ``dump_bin.py`` is executed is consistent with the original csv file data"""
        logger.info("start check......")

        error_list = []
        not_in_features = []
        compare_false = []
        with tqdm(total=len(self.csv_files)) as p_bar:
            with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
                for file_path, _check_res in zip(self.csv_files, executor.map(self._compare, self.csv_files)):
                    symbol = file_path.name.strip(self.file_suffix)
                    if _check_res == self.NOT_IN_FEATURES:
                        not_in_features.append(symbol)
                    elif _check_res == self.COMPARE_ERROR:
                        error_list.append(symbol)
                    elif _check_res == self.COMPARE_FALSE:
                        compare_false.append(symbol)
                    p_bar.update()

        logger.info("end of check......")
        if error_list:
            logger.warning(f"compare error: {error_list}")
        if not_in_features:
            logger.warning(f"not in features: {not_in_features}")
        if compare_false:
            logger.warning(f"compare False: {compare_false}")
        logger.info(
            f"total {len(self.csv_files)}, {len(error_list)} errors, {len(not_in_features)} not in features, {len(compare_false)} compare false"
        )


if __name__ == "__main__":
    fire.Fire(CheckBin)
@@ -0,0 +1,71 @@
import sys
import platform
import qlib
import fire
import pkg_resources
from pathlib import Path

QLIB_PATH = Path(__file__).absolute().resolve().parent.parent


class InfoCollector:
    """
    User could collect system info by following commands
    `cd scripts && python collect_info.py all`
    - NOTE: please avoid running this script in the project folder which contains `qlib`
    """

    def sys(self):
        """collect system related info"""
        for method in ["system", "machine", "platform", "version"]:
            print(getattr(platform, method)())

    def py(self):
        """collect Python related info"""
        print("Python version: {}".format(sys.version.replace("\n", " ")))

    def qlib(self):
        """collect qlib related info"""
        print("Qlib version: {}".format(qlib.__version__))
        REQUIRED = [
            "numpy",
            "pandas",
            "scipy",
            "requests",
            "sacred",
            "python-socketio",
            "redis",
            "python-redis-lock",
            "schedule",
            "cvxpy",
            "hyperopt",
            "fire",
            "statsmodels",
            "xlrd",
            "plotly",
            "matplotlib",
            "tables",
            "pyyaml",
            "mlflow",
            "tqdm",
            "loguru",
            "lightgbm",
            "tornado",
            "joblib",
            "fire",
            "ruamel.yaml",
        ]

        for package in REQUIRED:
            version = pkg_resources.get_distribution(package).version
            print(f"{package}=={version}")

    def all(self):
        """collect all info"""
        for method in ["sys", "py", "qlib"]:
            getattr(self, method)()
            print()


if __name__ == "__main__":
    fire.Fire(InfoCollector)
@@ -0,0 +1,60 @@
# Data Collector

## Introduction

Scripts for data collection

- yahoo: get *US/CN* stock data from *Yahoo Finance*
- fund: get fund data from *http://fund.eastmoney.com*
- cn_index: get *CN index* from *http://www.csindex.com.cn*, *CSI300*/*CSI100*
- us_index: get *US index* from *https://en.wikipedia.org/wiki*, *SP500*/*NASDAQ100*/*DJIA*/*SP400*
- contrib: scripts for some auxiliary functions


## Custom Data Collection

> Specific implementation reference: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo

1. Create a dataset code directory in the current directory
2. Add `collector.py`
   - add collector class:
     ```python
     CUR_DIR = Path(__file__).resolve().parent
     sys.path.append(str(CUR_DIR.parent.parent))
     from data_collector.base import BaseCollector, BaseNormalize, BaseRun
     class UserCollector(BaseCollector):
         ...
     ```
   - add normalize class:
     ```python
     class UserNormalize(BaseNormalize):
         ...
     ```
   - add `CLI` class:
     ```python
     class Run(BaseRun):
         ...
     ```
3. Add `README.md`
4. Add `requirements.txt`

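Putting the pieces above together, a minimal, hypothetical `collector.py` skeleton could look like the sketch below. The abstract method names come from `data_collector/base.py` in this commit; the symbols and return values are placeholders to be replaced by a real data source.

```python
import sys
from pathlib import Path
from typing import Iterable

import pandas as pd

CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))
from data_collector.base import BaseCollector, BaseNormalize, BaseRun


class UserCollector(BaseCollector):
    def get_instrument_list(self):
        return ["SYMBOL1", "SYMBOL2"]  # placeholder universe

    def normalize_symbol(self, symbol: str):
        return symbol.upper()

    def get_data(self, symbol, interval, start_datetime, end_datetime) -> pd.DataFrame:
        # fetch one symbol from your data source; the result must contain "symbol" and "date" columns
        raise NotImplementedError


class UserNormalize(BaseNormalize):
    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        return df  # reindex/adjust the raw data here

    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
        return []  # benchmark calendar of the target market


class Run(BaseRun):
    @property
    def collector_class_name(self):
        return "UserCollector"

    @property
    def normalize_class_name(self):
        return "UserNormalize"

    @property
    def default_base_dir(self):
        return CUR_DIR
```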
## Description of dataset

|             | Basic data                                                                              |
|-------------|------------------------------------------------------------------------------------------|
| Features    | **Price/Volume**: <br> - $close/$open/$low/$high/$volume/$change/$factor                  |
| Calendar    | **\<freq>.txt**: <br> - day.txt<br> - 1min.txt                                            |
| Instruments | **\<market>.txt**: <br> - required: **all.txt**; <br> - csi300.txt/csi500.txt/sp500.txt   |

- `Features`: data, **digital**
  - if not **adjusted**, **factor=1**

### Data-dependent component

> To make the components run correctly, the dependent data are required

| Component      | required data                                      |
|----------------|----------------------------------------------------|
| Data retrieval | Features, Calendar, Instruments                    |
| Backtest       | **Features[Price/Volume]**, Calendar, Instruments  |
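
As a concrete illustration of how the three data types in the table are consumed (not part of the original README; the path and tickers are placeholders), the calendar, instruments, and features can all be queried through `qlib.data.D` after `qlib.init`:

```python
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")  # directory containing calendars/, instruments/, features/

# Calendar: backed by calendars/day.txt
print(D.calendar(start_time="2020-01-01", end_time="2020-01-31", freq="day")[:5])

# Instruments: backed by instruments/<market>.txt (all.txt is required)
instruments = D.instruments(market="all")
print(D.list_instruments(instruments, as_list=True)[:5])

# Features: backed by features/<symbol>/<field>.day.bin
print(D.features(["SH600000"], ["$close", "$volume", "$factor"], freq="day").head())
```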
@@ -0,0 +1,427 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.


import abc
import time
import datetime
import importlib
from pathlib import Path
from typing import Type, Iterable
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

import pandas as pd
from tqdm import tqdm
from loguru import logger
from joblib import Parallel, delayed
from qlib.utils import code_to_fname


class BaseCollector(abc.ABC):

    CACHE_FLAG = "CACHED"
    NORMAL_FLAG = "NORMAL"

    DEFAULT_START_DATETIME_1D = pd.Timestamp("2000-01-01")
    DEFAULT_START_DATETIME_1MIN = pd.Timestamp(datetime.datetime.now() - pd.Timedelta(days=5 * 6 - 1)).date()
    DEFAULT_END_DATETIME_1D = pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1)).date()
    DEFAULT_END_DATETIME_1MIN = DEFAULT_END_DATETIME_1D

    INTERVAL_1min = "1min"
    INTERVAL_1d = "1d"

    def __init__(
        self,
        save_dir: [str, Path],
        start=None,
        end=None,
        interval="1d",
        max_workers=1,
        max_collector_count=2,
        delay=0,
        check_data_length: int = None,
        limit_nums: int = None,
    ):
        """

        Parameters
        ----------
        save_dir: str
            instrument save dir
        max_workers: int
            Concurrent number, default is 1; when collecting data, it is recommended that max_workers be set to 1
        max_collector_count: int
            default 2
        delay: float
            time.sleep(delay), default 0
        interval: str
            freq, value from [1min, 1d], default 1d
        start: str
            start datetime, default None
        end: str
            end datetime, default None
        check_data_length: int
            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
        limit_nums: int
            using for debug, by default None
        """
        self.save_dir = Path(save_dir).expanduser().resolve()
        self.save_dir.mkdir(parents=True, exist_ok=True)

        self.delay = delay
        self.max_workers = max_workers
        self.max_collector_count = max_collector_count
        self.mini_symbol_map = {}
        self.interval = interval
        self.check_data_length = max(int(check_data_length) if check_data_length is not None else 0, 0)

        self.start_datetime = self.normalize_start_datetime(start)
        self.end_datetime = self.normalize_end_datetime(end)

        self.instrument_list = sorted(set(self.get_instrument_list()))

        if limit_nums is not None:
            try:
                self.instrument_list = self.instrument_list[: int(limit_nums)]
            except Exception as e:
                logger.warning(f"Cannot use limit_nums={limit_nums}, the parameter will be ignored")

    def normalize_start_datetime(self, start_datetime: [str, pd.Timestamp] = None):
        return (
            pd.Timestamp(str(start_datetime))
            if start_datetime
            else getattr(self, f"DEFAULT_START_DATETIME_{self.interval.upper()}")
        )

    def normalize_end_datetime(self, end_datetime: [str, pd.Timestamp] = None):
        return (
            pd.Timestamp(str(end_datetime))
            if end_datetime
            else getattr(self, f"DEFAULT_END_DATETIME_{self.interval.upper()}")
        )

    @abc.abstractmethod
    def get_instrument_list(self):
        raise NotImplementedError("rewrite get_instrument_list")

    @abc.abstractmethod
    def normalize_symbol(self, symbol: str):
        """normalize symbol"""
        raise NotImplementedError("rewrite normalize_symbol")

    @abc.abstractmethod
    def get_data(
        self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
    ) -> pd.DataFrame:
        """get data with symbol

        Parameters
        ----------
        symbol: str
        interval: str
            value from [1min, 1d]
        start_datetime: pd.Timestamp
        end_datetime: pd.Timestamp

        Returns
        ---------
        pd.DataFrame, "symbol" and "date" in pd.columns

        """
        raise NotImplementedError("rewrite get_timezone")

    def sleep(self):
        time.sleep(self.delay)

    def _simple_collector(self, symbol: str):
        """

        Parameters
        ----------
        symbol: str

        """
        self.sleep()
        df = self.get_data(symbol, self.interval, self.start_datetime, self.end_datetime)
        _result = self.NORMAL_FLAG
        if self.check_data_length > 0:
            _result = self.cache_small_data(symbol, df)
        if _result == self.NORMAL_FLAG:
            self.save_instrument(symbol, df)
        return _result

    def save_instrument(self, symbol, df: pd.DataFrame):
        """save instrument data to file

        Parameters
        ----------
        symbol: str
            instrument code
        df : pd.DataFrame
            df.columns must contain "symbol" and "datetime"
        """
        if df is None or df.empty:
            logger.warning(f"{symbol} is empty")
            return

        symbol = self.normalize_symbol(symbol)
        symbol = code_to_fname(symbol)
        instrument_path = self.save_dir.joinpath(f"{symbol}.csv")
        df["symbol"] = symbol
        if instrument_path.exists():
            _old_df = pd.read_csv(instrument_path)
            df = pd.concat([_old_df, df], sort=False)
        df.to_csv(instrument_path, index=False)

    def cache_small_data(self, symbol, df):
        if len(df) < self.check_data_length:
            logger.warning(f"the number of trading days of {symbol} is less than {self.check_data_length}!")
            _temp = self.mini_symbol_map.setdefault(symbol, [])
            _temp.append(df.copy())
            return self.CACHE_FLAG
        else:
            if symbol in self.mini_symbol_map:
                self.mini_symbol_map.pop(symbol)
            return self.NORMAL_FLAG

    def _collector(self, instrument_list):

        error_symbol = []
        res = Parallel(n_jobs=self.max_workers)(
            delayed(self._simple_collector)(_inst) for _inst in tqdm(instrument_list)
        )
        for _symbol, _result in zip(instrument_list, res):
            if _result != self.NORMAL_FLAG:
                error_symbol.append(_symbol)
        print(error_symbol)
        logger.info(f"error symbol nums: {len(error_symbol)}")
        logger.info(f"current get symbol nums: {len(instrument_list)}")
        error_symbol.extend(self.mini_symbol_map.keys())
        return sorted(set(error_symbol))

    def collector_data(self):
        """collector data"""
        logger.info("start collector data......")
        instrument_list = self.instrument_list
        for i in range(self.max_collector_count):
            if not instrument_list:
                break
            logger.info(f"getting data: {i+1}")
            instrument_list = self._collector(instrument_list)
            logger.info(f"{i+1} finish.")
        for _symbol, _df_list in self.mini_symbol_map.items():
            _df = pd.concat(_df_list, sort=False)
            if not _df.empty:
                self.save_instrument(_symbol, _df.drop_duplicates(["date"]).sort_values(["date"]))
        if self.mini_symbol_map:
            logger.warning(f"less than {self.check_data_length} instrument list: {list(self.mini_symbol_map.keys())}")
        logger.info(f"total {len(self.instrument_list)}, error: {len(set(instrument_list))}")


class BaseNormalize(abc.ABC):
    def __init__(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs):
        """

        Parameters
        ----------
        date_field_name: str
            date field name, default is date
        symbol_field_name: str
            symbol field name, default is symbol
        """
        self._date_field_name = date_field_name
        self._symbol_field_name = symbol_field_name
        self.kwargs = kwargs
        self._calendar_list = self._get_calendar_list()

    @abc.abstractmethod
    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        # normalize
        raise NotImplementedError("")

    @abc.abstractmethod
    def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
        """Get benchmark calendar"""
        raise NotImplementedError("")


class Normalize:
    def __init__(
        self,
        source_dir: [str, Path],
        target_dir: [str, Path],
        normalize_class: Type[BaseNormalize],
        max_workers: int = 16,
        date_field_name: str = "date",
        symbol_field_name: str = "symbol",
        **kwargs,
    ):
        """

        Parameters
        ----------
        source_dir: str or Path
            The directory where the raw data collected from the Internet is saved
        target_dir: str or Path
            Directory for normalize data
        normalize_class: Type[YahooNormalize]
            normalize class
        max_workers: int
            Concurrent number, default is 16
        date_field_name: str
            date field name, default is date
        symbol_field_name: str
            symbol field name, default is symbol
        """
        if not (source_dir and target_dir):
            raise ValueError("source_dir and target_dir cannot be None")
        self._source_dir = Path(source_dir).expanduser()
        self._target_dir = Path(target_dir).expanduser()
        self._target_dir.mkdir(parents=True, exist_ok=True)
        self._date_field_name = date_field_name
        self._symbol_field_name = symbol_field_name
        self._end_date = kwargs.get("end_date", None)
        self._max_workers = max_workers

        self._normalize_obj = normalize_class(
            date_field_name=date_field_name, symbol_field_name=symbol_field_name, **kwargs
        )

    def _executor(self, file_path: Path):
        file_path = Path(file_path)
        df = pd.read_csv(file_path)
        df = self._normalize_obj.normalize(df)
        if df is not None and not df.empty:
            if self._end_date is not None:
                _mask = pd.to_datetime(df[self._date_field_name]) <= pd.Timestamp(self._end_date)
                df = df[_mask]
            df.to_csv(self._target_dir.joinpath(file_path.name), index=False)

    def normalize(self):
        logger.info("normalize data......")

        with ProcessPoolExecutor(max_workers=self._max_workers) as worker:
            file_list = list(self._source_dir.glob("*.csv"))
            with tqdm(total=len(file_list)) as p_bar:
                for _ in worker.map(self._executor, file_list):
                    p_bar.update()


class BaseRun(abc.ABC):
    def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d"):
        """

        Parameters
        ----------
        source_dir: str
            The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
        normalize_dir: str
            Directory for normalize data, default "Path(__file__).parent/normalize"
        max_workers: int
            Concurrent number, default is 1; when collecting data, it is recommended that max_workers be set to 1
        interval: str
            freq, value from [1min, 1d], default 1d
        """
        if source_dir is None:
            source_dir = Path(self.default_base_dir).joinpath("source")
        self.source_dir = Path(source_dir).expanduser().resolve()
        self.source_dir.mkdir(parents=True, exist_ok=True)

        if normalize_dir is None:
            normalize_dir = Path(self.default_base_dir).joinpath("normalize")
        self.normalize_dir = Path(normalize_dir).expanduser().resolve()
        self.normalize_dir.mkdir(parents=True, exist_ok=True)

        self._cur_module = importlib.import_module("collector")
        self.max_workers = max_workers
        self.interval = interval

    @property
    @abc.abstractmethod
    def collector_class_name(self):
        raise NotImplementedError("rewrite collector_class_name")

    @property
    @abc.abstractmethod
    def normalize_class_name(self):
        raise NotImplementedError("rewrite normalize_class_name")

    @property
    @abc.abstractmethod
    def default_base_dir(self) -> [Path, str]:
        raise NotImplementedError("rewrite default_base_dir")

    def download_data(
        self,
        max_collector_count=2,
        delay=0,
        start=None,
        end=None,
        check_data_length: int = None,
        limit_nums=None,
        **kwargs,
    ):
        """download data from Internet

        Parameters
        ----------
        max_collector_count: int
            default 2
        delay: float
            time.sleep(delay), default 0
        start: str
            start datetime, default "2000-01-01"
        end: str
            end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
        check_data_length: int
            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
        limit_nums: int
            using for debug, by default None

        Examples
        ---------
            # get daily data
            $ python collector.py download_data --source_dir ~/.qlib/instrument_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
            # get 1m data
            $ python collector.py download_data --source_dir ~/.qlib/instrument_data/source --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1m
        """

        _class = getattr(self._cur_module, self.collector_class_name)  # type: Type[BaseCollector]
        _class(
            self.source_dir,
            max_workers=self.max_workers,
            max_collector_count=max_collector_count,
            delay=delay,
            start=start,
            end=end,
            interval=self.interval,
            check_data_length=check_data_length,
            limit_nums=limit_nums,
            **kwargs,
        ).collector_data()

    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol", **kwargs):
        """normalize data

        Parameters
        ----------
        date_field_name: str
            date field name, default date
        symbol_field_name: str
            symbol field name, default symbol

        Examples
        ---------
            $ python collector.py normalize_data --source_dir ~/.qlib/instrument_data/source --normalize_dir ~/.qlib/instrument_data/normalize --region CN --interval 1d
        """
        _class = getattr(self._cur_module, self.normalize_class_name)
        yc = Normalize(
            source_dir=self.source_dir,
            target_dir=self.normalize_dir,
            normalize_class=_class,
            max_workers=self.max_workers,
            date_field_name=date_field_name,
            symbol_field_name=symbol_field_name,
            **kwargs,
        )
        yc.normalize()
@@ -0,0 +1,61 @@
# iBOVESPA History Companies Collection

## Requirements

- Install the libs from the file `requirements.txt`

    ```bash
    pip install -r requirements.txt
    ```
- The `requirements.txt` file was generated using python3.8

## For the ibovespa (IBOV) index, we have:

<hr/>

### Method `get_new_companies`

#### <b>Index start date</b>

- The ibovespa index started on 2 January 1968 ([wiki](https://en.wikipedia.org/wiki/%C3%8Dndice_Bovespa)). In order to use this start date in our `bench_start_date(self)` method, two conditions must be satisfied:

    1) The APIs used to download Brazilian (B3) stocks' historical prices must keep track of such data since 2 January 1968.

    2) Some website or API must provide, from that date onward, the historic index composition, i.e. the companies used to build the index.

    As a consequence, the method `bench_start_date(self)` inside `collector.py` was implemented using `pd.Timestamp("2003-01-03")` for two reasons:

    1) The earliest ibov composition that has been found is from the first quarter of 2003. More information about this composition can be found in the sections below.

    2) Yahoo Finance, one of the libraries used to download symbols' historic prices, keeps track of prices from this date forward.

- Within the `get_new_companies` method, logic was implemented to get, for each ibovespa component stock, the start date that Yahoo Finance keeps track of.

#### <b>Code Logic</b>

The code does web scraping of B3's [website](https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br), which keeps track of the ibovespa stock composition on the current day.

Other approaches, such as `requests` and `Beautiful Soup`, could have been used. However, the website renders the stock table with some delay, since it uses an in-page script to obtain the composition.
`selenium` was therefore used to download the composition and overcome this problem.

Furthermore, the data downloaded by the selenium script was preprocessed so it could be saved in the `csv` format established by `scripts/data_collector/index.py`.
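For illustration only (this is not the repository's actual scraper): a minimal selenium sketch of the approach described above, assuming a Chrome driver is available on PATH and that the page's script has filled the table after a short wait.

```python
import time

import pandas as pd
from selenium import webdriver

B3_URL = "https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBOV?language=pt-br"

driver = webdriver.Chrome()
driver.get(B3_URL)
time.sleep(5)  # give the in-page script time to render the composition table

# the composition table is rendered into the DOM, so it can be read from the page source
tables = pd.read_html(driver.page_source)
driver.quit()

ibov_df = tables[0]
ibov_df.to_csv("ibov_composition.csv", index=False)  # hypothetical output path
```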
<hr/>

### Method `get_changes`

No suitable data source that keeps track of ibovespa's historical stock composition has been found, except for this [repository](https://github.com/igor17400/IBOV-HCI), which provides such information only from the 1st quarter of 2003 to the 3rd quarter of 2021.

With that reference, the index's composition can be compared quarter by quarter and year by year in order to generate a file that keeps track of which stocks have been removed and which have been added in each quarter and year.

<hr/>

### Collector Data

```bash
# parse instruments, using in qlib/instruments.
python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method parse_instruments

# parse new companies
python collector.py --index_name IBOV --qlib_dir ~/.qlib/qlib_data/br_data --method save_new_companies
```
@@ -0,0 +1,287 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from functools import partial
import sys
from pathlib import Path
import importlib
import datetime

import fire
import pandas as pd
from tqdm import tqdm
from loguru import logger

CUR_DIR = Path(__file__).resolve().parent
sys.path.append(str(CUR_DIR.parent.parent))

from data_collector.index import IndexBase
from data_collector.utils import get_instruments

quarter_dict = {"1Q": "01-03", "2Q": "05-01", "3Q": "09-01"}


class IBOVIndex(IndexBase):

    ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv"
    years_4_month_periods = []

    def __init__(
        self,
        index_name: str,
        qlib_dir: [str, Path] = None,
        freq: str = "day",
        request_retry: int = 5,
        retry_sleep: int = 3,
    ):
        super(IBOVIndex, self).__init__(
            index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
        )

        self.today: datetime = datetime.date.today()
        self.current_4_month_period = self.get_current_4_month_period(self.today.month)
        self.year = str(self.today.year)
        self.years_4_month_periods = self.get_four_month_period()

    @property
    def bench_start_date(self) -> pd.Timestamp:
        """
        The ibovespa index started on 2 January 1968 (wiki); however,
        no suitable data source that keeps track of ibovespa's historical
        stock composition has been found, except for the repo indicated
        in the README, which keeps track of such information starting from
        the first quarter of 2003.
        """
        return pd.Timestamp("2003-01-03")

    def get_current_4_month_period(self, current_month: int):
        """
        This function is used to calculate the current
        four-month period for the current month. For example,
        if the current month is August (8), its four-month period
        is 2Q.

        OBS: In English, Q is used to represent a *quarter*,
        which means a three-month period. However, in
        Portuguese we use Q to represent a four-month period.
        In other words,

        Jan, Feb, Mar, Apr: 1Q
        May, Jun, Jul, Aug: 2Q
        Sep, Oct, Nov, Dec: 3Q

        Parameters
        ----------
        current_month : int
            Current month (1 <= month <= 12)

        Returns
        -------
        current_4m_period: str
            Current four-month period (1Q, 2Q or 3Q)
        """
        if current_month < 5:
            return "1Q"
        if current_month < 9:
            return "2Q"
        if current_month <= 12:
            return "3Q"
        else:
            return -1

    def get_four_month_period(self):
        """
        The ibovespa index is updated every four months.
        Therefore, we represent each time period as 2003_1Q,
        which means the first four-month period of 2003 (Jan, Feb, Mar, Apr).
        """
        four_months_period = ["1Q", "2Q", "3Q"]
        init_year = 2003
        now = datetime.datetime.now()
        current_year = now.year
        current_month = now.month
        for year in [item for item in range(init_year, current_year)]:
            for el in four_months_period:
                self.years_4_month_periods.append(str(year) + "_" + el)
        # For the current year the logic must be a little different
        current_4_month_period = self.get_current_4_month_period(current_month)
        for i in range(int(current_4_month_period[0])):
            self.years_4_month_periods.append(str(current_year) + "_" + str(i + 1) + "Q")
        return self.years_4_month_periods

    def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
        """formatting the datetime in an instrument

        Parameters
        ----------
        inst_df: pd.DataFrame
            inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]

        Returns
        -------
        inst_df: pd.DataFrame

        """
        logger.info("Formatting Datetime")
        if self.freq != "day":
            inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
                lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
            )
        else:
            inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
                lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d")
            )

            inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
                lambda x: (pd.Timestamp(x)).strftime("%Y-%m-%d")
            )
        return inst_df

    def format_quarter(self, cell: str):
        """
        Parameters
        ----------
        cell: str
            It must be in the format 2003_1Q --> years_4_month_periods

        Returns
        ----------
        date: str
            Returns the date in the format 2003-03-01
        """
        cell_split = cell.split("_")
        return cell_split[0] + "-" + quarter_dict[cell_split[1]]

    def get_changes(self):
        """
        Access the index historic composition and compare it quarter
        by quarter and year by year in order to generate a file that
        keeps track of which stocks have been removed and which have
        been added.

        The DataFrame used as reference will provide the index
        composition for each year and quarter:
        pd.DataFrame:
            symbol
            SH600000
            SH600001
            .
            .
            .

        Parameters
        ----------
        self: is used to represent the instance of the class.

        Returns
        ----------
        pd.DataFrame:
            symbol      date        type
            SH600000  2019-11-11    add
            SH600001  2020-11-10    remove
        dtypes:
            symbol: str
            date: pd.Timestamp
            type: str, value from ["add", "remove"]
        """
        logger.info("Getting companies changes in {} index ...".format(self.index_name))

        try:
            df_changes_list = []
            for i in tqdm(range(len(self.years_4_month_periods) - 1)):
                df = pd.read_csv(
                    self.ibov_index_composition.format(self.years_4_month_periods[i]), on_bad_lines="skip"
                )["symbol"]
                df_ = pd.read_csv(
                    self.ibov_index_composition.format(self.years_4_month_periods[i + 1]), on_bad_lines="skip"
                )["symbol"]

                ## Remove Dataframe
                remove_date = (
                    self.years_4_month_periods[i].split("_")[0]
                    + "-"
                    + quarter_dict[self.years_4_month_periods[i].split("_")[1]]
                )
                list_remove = list(df[~df.isin(df_)])
                df_removed = pd.DataFrame(
                    {
                        "date": len(list_remove) * [remove_date],
                        "type": len(list_remove) * ["remove"],
                        "symbol": list_remove,
                    }
                )

                ## Add Dataframe
                add_date = (
                    self.years_4_month_periods[i + 1].split("_")[0]
                    + "-"
                    + quarter_dict[self.years_4_month_periods[i + 1].split("_")[1]]
                )
                list_add = list(df_[~df_.isin(df)])
                df_added = pd.DataFrame(
                    {"date": len(list_add) * [add_date], "type": len(list_add) * ["add"], "symbol": list_add}
                )

                df_changes_list.append(pd.concat([df_added, df_removed], sort=False))
            df = pd.concat(df_changes_list).reset_index(drop=True)
            df["symbol"] = df["symbol"].astype(str) + ".SA"

            return df

        except Exception as E:
            logger.error("An error occurred while downloading the index composition - {}".format(E))

    def get_new_companies(self):
        """
        Get the latest index composition.
        The repo indicated in the README has implemented a script
        to get the latest index composition from the B3 website using
        selenium. Therefore, this method will download the file
        containing such composition.

        Parameters
        ----------
        self: is used to represent the instance of the class.

        Returns
        ----------
        pd.DataFrame:
            symbol     start_date    end_date
            RRRP3      2020-11-13    2022-03-02
            ALPA4      2008-01-02    2022-03-02
        dtypes:
            symbol: str
            start_date: pd.Timestamp
            end_date: pd.Timestamp
        """
        logger.info("Getting new companies in {} index ...".format(self.index_name))

        try:
            ## Get index composition

            df_index = pd.read_csv(
                self.ibov_index_composition.format(self.year + "_" + self.current_4_month_period), on_bad_lines="skip"
            )
            df_date_first_added = pd.read_csv(
                self.ibov_index_composition.format("date_first_added_" + self.year + "_" + self.current_4_month_period),
                on_bad_lines="skip",
            )
            df = df_index.merge(df_date_first_added, on="symbol")[["symbol", "Date First Added"]]
            df[self.START_DATE_FIELD] = df["Date First Added"].map(self.format_quarter)

            # end_date will be our current quarter + 1, since the IBOV index updates itself every quarter
            df[self.END_DATE_FIELD] = self.year + "-" + quarter_dict[self.current_4_month_period]
            df = df[["symbol", self.START_DATE_FIELD, self.END_DATE_FIELD]]
            df["symbol"] = df["symbol"].astype(str) + ".SA"

            return df

        except Exception as E:
            logger.error("An error occurred while getting new companies - {}".format(E))

    def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
        if "Código" in df.columns:
            return df.loc[:, ["Código"]].copy()


if __name__ == "__main__":
    fire.Fire(partial(get_instruments, market_index="br_index"))
@@ -0,0 +1,34 @@
async-generator==1.10
attrs==21.4.0
certifi==2021.10.8
cffi==1.15.0
charset-normalizer==2.0.12
cryptography==36.0.1
fire==0.4.0
h11==0.13.0
idna==3.3
loguru==0.6.0
lxml==4.8.0
multitasking==0.0.10
numpy==1.22.2
outcome==1.1.0
pandas==1.4.1
pycoingecko==2.2.0
pycparser==2.21
pyOpenSSL==22.0.0
PySocks==1.7.1
python-dateutil==2.8.2
pytz==2021.3
requests==2.27.1
requests-futures==1.0.0
six==1.16.0
sniffio==1.2.0
sortedcontainers==2.4.0
termcolor==1.1.0
tqdm==4.63.0
trio==0.20.0
trio-websocket==0.9.2
urllib3==1.26.8
wget==3.2
wsproto==1.1.0
yahooquery==2.2.15
@ -0,0 +1,22 @@
|
||||
# CSI300/CSI100/CSI500 History Companies Collection
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Collector Data
|
||||
|
||||
```bash
|
||||
# parse instruments, used in qlib/instruments.
|
||||
python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
|
||||
|
||||
# parse new companies
|
||||
python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
|
||||
|
||||
# index_name support: CSI300, CSI100, CSI500
|
||||
# help
|
||||
python collector.py --help
|
||||
```
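
After `parse_instruments` finishes, the generated instruments file (e.g. `csi300.txt`) can be read back through qlib. A minimal sketch, assuming qlib data has already been prepared under `~/.qlib/qlib_data/cn_data`:

```python
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")
# load the instrument pool parsed into instruments/csi300.txt
instruments = D.instruments(market="csi300")
print(D.list_instruments(instruments=instruments, as_list=True)[:10])
```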
|
||||
|
@ -0,0 +1,468 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import re
|
||||
import abc
|
||||
import sys
|
||||
import datetime
|
||||
from io import BytesIO
|
||||
from typing import List, Iterable
|
||||
from pathlib import Path
|
||||
|
||||
import fire
|
||||
import requests
|
||||
import pandas as pd
|
||||
import baostock as bs
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent))
|
||||
|
||||
from data_collector.index import IndexBase
|
||||
from data_collector.utils import get_calendar_list, get_trading_date_by_shift, deco_retry
|
||||
from data_collector.utils import get_instruments
|
||||
|
||||
|
||||
NEW_COMPANIES_URL = "https://csi-web-dev.oss-cn-shanghai-finance-1-pub.aliyuncs.com/static/html/csindex/public/uploads/file/autofile/cons/{index_code}cons.xls"
|
||||
|
||||
|
||||
INDEX_CHANGES_URL = "https://www.csindex.com.cn/csindex-home/search/search-content?lang=cn&searchInput=%E5%85%B3%E4%BA%8E%E8%B0%83%E6%95%B4%E6%B2%AA%E6%B7%B1300%E5%92%8C%E4%B8%AD%E8%AF%81%E9%A6%99%E6%B8%AF100%E7%AD%89%E6%8C%87%E6%95%B0%E6%A0%B7%E6%9C%AC&pageNum={page_num}&pageSize={page_size}&sortField=date&dateRange=all&contentType=announcement"
|
||||
|
||||
REQ_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48"
|
||||
}
|
||||
|
||||
|
||||
@deco_retry
|
||||
def retry_request(url: str, method: str = "get", exclude_status: List = None):
|
||||
if exclude_status is None:
|
||||
exclude_status = []
|
||||
method_func = getattr(requests, method)
|
||||
_resp = method_func(url, headers=REQ_HEADERS)
|
||||
_status = _resp.status_code
|
||||
if _status not in exclude_status and _status != 200:
|
||||
raise ValueError(f"response status: {_status}, url={url}")
|
||||
return _resp
|
||||
|
||||
|
||||
class CSIIndex(IndexBase):
|
||||
@property
|
||||
def calendar_list(self) -> List[pd.Timestamp]:
|
||||
"""get history trading date
|
||||
|
||||
Returns
|
||||
-------
|
||||
calendar list
|
||||
"""
|
||||
_calendar = getattr(self, "_calendar_list", None)
|
||||
if not _calendar:
|
||||
_calendar = get_calendar_list(bench_code=self.index_name.upper())
|
||||
setattr(self, "_calendar_list", _calendar)
|
||||
return _calendar
|
||||
|
||||
@property
|
||||
def new_companies_url(self) -> str:
|
||||
return NEW_COMPANIES_URL.format(index_code=self.index_code)
|
||||
|
||||
@property
|
||||
def changes_url(self) -> str:
|
||||
return INDEX_CHANGES_URL
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
"""
|
||||
Returns
|
||||
-------
|
||||
index start date
|
||||
"""
|
||||
raise NotImplementedError("rewrite bench_start_date")
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def index_code(self) -> str:
|
||||
"""
|
||||
Returns
|
||||
-------
|
||||
index code
|
||||
"""
|
||||
raise NotImplementedError("rewrite index_code")
|
||||
|
||||
@property
|
||||
def html_table_index(self) -> int:
|
||||
"""Which table of changes in html
|
||||
|
||||
CSI300: 0
|
||||
CSI100: 1
|
||||
:return:
|
||||
"""
|
||||
raise NotImplementedError("rewrite html_table_index")
|
||||
|
||||
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""formatting the datetime in an instrument
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inst_df: pd.DataFrame
|
||||
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
if self.freq != "day":
|
||||
inst_df[self.START_DATE_FIELD] = inst_df[self.START_DATE_FIELD].apply(
|
||||
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=9, minutes=30)).strftime("%Y-%m-%d %H:%M:%S")
|
||||
)
|
||||
inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
|
||||
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=15, minutes=0)).strftime("%Y-%m-%d %H:%M:%S")
|
||||
)
|
||||
return inst_df
|
||||
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
"""get companies changes
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
"""
|
||||
logger.info("get companies changes......")
|
||||
res = []
|
||||
for _url in self._get_change_notices_url():
|
||||
_df = self._read_change_from_url(_url)
|
||||
if not _df.empty:
|
||||
res.append(_df)
|
||||
logger.info("get companies changes finish")
|
||||
return pd.concat(res, sort=False)
|
||||
|
||||
@staticmethod
|
||||
def normalize_symbol(symbol: str) -> str:
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
symbol: str
|
||||
symbol
|
||||
|
||||
Returns
|
||||
-------
|
||||
symbol
|
||||
"""
|
||||
symbol = f"{int(symbol):06}"
|
||||
return f"SH{symbol}" if symbol.startswith("60") or symbol.startswith("688") else f"SZ{symbol}"
|
||||
|
||||
def _parse_excel(self, excel_url: str, add_date: pd.Timestamp, remove_date: pd.Timestamp) -> pd.DataFrame:
|
||||
content = retry_request(excel_url, exclude_status=[404]).content
|
||||
_io = BytesIO(content)
|
||||
df_map = pd.read_excel(_io, sheet_name=None)
|
||||
with self.cache_dir.joinpath(
|
||||
f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.{excel_url.split('.')[-1]}"
|
||||
).open("wb") as fp:
|
||||
fp.write(content)
|
||||
tmp = []
|
||||
for _s_name, _type, _date in [("调入", self.ADD, add_date), ("调出", self.REMOVE, remove_date)]:
|
||||
_df = df_map[_s_name]
|
||||
_df = _df.loc[_df["指数代码"] == self.index_code, ["证券代码"]]
|
||||
_df = _df.applymap(self.normalize_symbol)
|
||||
_df.columns = [self.SYMBOL_FIELD_NAME]
|
||||
_df["type"] = _type
|
||||
_df[self.DATE_FIELD_NAME] = _date
|
||||
tmp.append(_df)
|
||||
df = pd.concat(tmp)
|
||||
return df
|
||||
|
||||
def _parse_table(self, content: str, add_date: pd.DataFrame, remove_date: pd.DataFrame) -> pd.DataFrame:
|
||||
df = pd.DataFrame()
|
||||
_tmp_count = 0
|
||||
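# scan every HTML table in the announcement body; skip tables that do not have 4 columns
# or whose first cell is null, then parse only the table at position html_table_index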
for _df in pd.read_html(content):
|
||||
if _df.shape[-1] != 4 or _df.isnull().loc(0)[0][0]:
|
||||
continue
|
||||
_tmp_count += 1
|
||||
if self.html_table_index + 1 > _tmp_count:
|
||||
continue
|
||||
tmp = []
|
||||
for _s, _type, _date in [
|
||||
(_df.iloc[2:, 0], self.REMOVE, remove_date),
|
||||
(_df.iloc[2:, 2], self.ADD, add_date),
|
||||
]:
|
||||
_tmp_df = pd.DataFrame()
|
||||
_tmp_df[self.SYMBOL_FIELD_NAME] = _s.map(self.normalize_symbol)
|
||||
_tmp_df["type"] = _type
|
||||
_tmp_df[self.DATE_FIELD_NAME] = _date
|
||||
tmp.append(_tmp_df)
|
||||
df = pd.concat(tmp)
|
||||
df.to_csv(
|
||||
str(
|
||||
self.cache_dir.joinpath(
|
||||
f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.csv"
|
||||
).resolve()
|
||||
)
|
||||
)
|
||||
break
|
||||
return df
|
||||
|
||||
def _read_change_from_url(self, url: str) -> pd.DataFrame:
|
||||
"""read change from url
|
||||
The parameter url is from the _get_change_notices_url method.
|
||||
Determine the stock add_date/remove_date based on the title.
|
||||
The response contains three cases:
|
||||
1. Only excel_url (extract data from excel_url)
|
||||
2. Both the excel_url and the body text (try to extract data from excel_url first, and then try to extract data from body text)
|
||||
3. Only body text (extract data from body text)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
url : str
|
||||
change url
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
"""
|
||||
resp = retry_request(url).json()["data"]
|
||||
title = resp["title"]
|
||||
if not title.startswith("关于"):
|
||||
return pd.DataFrame()
|
||||
if "沪深300" not in title:
|
||||
return pd.DataFrame()
|
||||
|
||||
logger.info(f"load index data from https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}")
|
||||
_text = resp["content"]
|
||||
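# derive add_date from the announcement text: prefer the first full 年/月/日 date, otherwise fall back to the 年/月 pattern
# aligned to the trading calendar (shifted to the next trading day for after-close 盘后/市后 announcements); remove_date is the trading day before add_date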
date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
|
||||
if len(date_list) >= 2:
|
||||
add_date = pd.Timestamp("-".join(date_list[0]))
|
||||
else:
|
||||
_date = pd.Timestamp("-".join(re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
|
||||
add_date = get_trading_date_by_shift(self.calendar_list, _date, shift=0)
|
||||
if "盘后" in _text or "市后" in _text:
|
||||
add_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=1)
|
||||
remove_date = get_trading_date_by_shift(self.calendar_list, add_date, shift=-1)
|
||||
|
||||
excel_url = None
|
||||
if resp.get("enclosureList", []):
|
||||
excel_url = resp["enclosureList"][0]["fileUrl"]
|
||||
else:
|
||||
excel_url_list = re.findall('.*href="(.*?xls.*?)".*', _text)
|
||||
if excel_url_list:
|
||||
excel_url = excel_url_list[0]
|
||||
if not excel_url.startswith("http"):
|
||||
excel_url = excel_url if excel_url.startswith("/") else "/" + excel_url
|
||||
excel_url = f"http://www.csindex.com.cn{excel_url}"
|
||||
if excel_url:
|
||||
try:
|
||||
logger.info(f"get {add_date} changes from the excel, title={title}, excel_url={excel_url}")
|
||||
df = self._parse_excel(excel_url, add_date, remove_date)
|
||||
except ValueError:
|
||||
logger.info(
|
||||
f"get {add_date} changes from the web page, title={title}, url=https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
|
||||
)
|
||||
df = self._parse_table(_text, add_date, remove_date)
|
||||
else:
|
||||
logger.info(
|
||||
f"get {add_date} changes from the web page, title={title}, url=https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
|
||||
)
|
||||
df = self._parse_table(_text, add_date, remove_date)
|
||||
return df
|
||||
|
||||
def _get_change_notices_url(self) -> Iterable[str]:
|
||||
"""get change notices url
|
||||
|
||||
Returns
|
||||
-------
|
||||
[url1, url2]
|
||||
"""
|
||||
page_num = 1
|
||||
page_size = 5
|
||||
data = retry_request(self.changes_url.format(page_size=page_size, page_num=page_num)).json()
|
||||
data = retry_request(self.changes_url.format(page_size=data["total"], page_num=page_num)).json()
|
||||
for item in data["data"]:
|
||||
yield f"https://www.csindex.com.cn/csindex-home/announcement/queryAnnouncementById?id={item['id']}"
|
||||
|
||||
def get_new_companies(self) -> pd.DataFrame:
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
|
||||
symbol start_date end_date
|
||||
SH600000 2000-01-01 2099-12-31
|
||||
|
||||
dtypes:
|
||||
symbol: str
|
||||
start_date: pd.Timestamp
|
||||
end_date: pd.Timestamp
|
||||
"""
|
||||
logger.info("get new companies......")
|
||||
context = retry_request(self.new_companies_url).content
|
||||
with self.cache_dir.joinpath(
|
||||
f"{self.index_name.lower()}_new_companies.{self.new_companies_url.split('.')[-1]}"
|
||||
).open("wb") as fp:
|
||||
fp.write(context)
|
||||
_io = BytesIO(context)
|
||||
df = pd.read_excel(_io)
|
||||
df = df.iloc[:, [0, 4]]
|
||||
df.columns = [self.END_DATE_FIELD, self.SYMBOL_FIELD_NAME]
|
||||
df[self.SYMBOL_FIELD_NAME] = df[self.SYMBOL_FIELD_NAME].map(self.normalize_symbol)
|
||||
df[self.END_DATE_FIELD] = pd.to_datetime(df[self.END_DATE_FIELD].astype(str))
|
||||
df[self.START_DATE_FIELD] = self.bench_start_date
|
||||
logger.info("end of get new companies.")
|
||||
return df
|
||||
|
||||
|
||||
class CSI300Index(CSIIndex):
|
||||
@property
|
||||
def index_code(self):
|
||||
return "000300"
|
||||
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("2005-01-01")
|
||||
|
||||
@property
|
||||
def html_table_index(self) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
class CSI100Index(CSIIndex):
|
||||
@property
|
||||
def index_code(self):
|
||||
return "000903"
|
||||
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("2006-05-29")
|
||||
|
||||
@property
|
||||
def html_table_index(self) -> int:
|
||||
return 1
|
||||
|
||||
|
||||
class CSI500Index(CSIIndex):
|
||||
@property
|
||||
def index_code(self) -> str:
|
||||
return "000905"
|
||||
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("2007-01-15")
|
||||
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
"""get companies changes
|
||||
|
||||
Return
|
||||
--------
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
"""
|
||||
return self.get_changes_with_history_companies(self.get_history_companies())
|
||||
|
||||
def get_history_companies(self) -> pd.DataFrame:
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
"""
|
||||
bs.login()
|
||||
today = pd.Timestamp.now()
|
||||
date_range = pd.DataFrame(pd.date_range(start="2007-01-15", end=today, freq="7D"))[0].dt.date
|
||||
ret_list = []
|
||||
for date in tqdm(date_range, desc="Download CSI500"):
|
||||
result = self.get_data_from_baostock(date)
|
||||
ret_list.append(result[["date", "symbol"]])
|
||||
bs.logout()
|
||||
return pd.concat(ret_list, sort=False)
|
||||
|
||||
def get_data_from_baostock(self, date) -> pd.DataFrame:
|
||||
"""
|
||||
Data source: http://baostock.com/baostock/index.php/%E4%B8%AD%E8%AF%81500%E6%88%90%E5%88%86%E8%82%A1
|
||||
Avoid a large number of parallel data acquisition,
|
||||
such as 1000 times of concurrent data acquisition, because IP will be blocked
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
date symbol code_name
|
||||
SH600039 2007-01-15 四川路桥
|
||||
SH600051 2020-01-15 宁波联合
|
||||
dtypes:
|
||||
date: pd.Timestamp
|
||||
symbol: str
|
||||
code_name: str
|
||||
"""
|
||||
col = ["date", "symbol", "code_name"]
|
||||
rs = bs.query_zz500_stocks(date=str(date))
|
||||
zz500_stocks = []
|
||||
while (rs.error_code == "0") & rs.next():
|
||||
zz500_stocks.append(rs.get_row_data())
|
||||
result = pd.DataFrame(zz500_stocks, columns=col)
|
||||
result["symbol"] = result["symbol"].apply(lambda x: x.replace(".", "").upper())
|
||||
return result
|
||||
|
||||
def get_new_companies(self) -> pd.DataFrame:
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
|
||||
symbol start_date end_date
|
||||
SH600000 2000-01-01 2099-12-31
|
||||
|
||||
dtypes:
|
||||
symbol: str
|
||||
start_date: pd.Timestamp
|
||||
end_date: pd.Timestamp
|
||||
"""
|
||||
logger.info("get new companies......")
|
||||
today = datetime.date.today()
|
||||
bs.login()
|
||||
result = self.get_data_from_baostock(today)
|
||||
bs.logout()
|
||||
df = result[["date", "symbol"]]
|
||||
df.columns = [self.END_DATE_FIELD, self.SYMBOL_FIELD_NAME]
|
||||
df[self.END_DATE_FIELD] = pd.to_datetime(df[self.END_DATE_FIELD].astype(str))
|
||||
df[self.START_DATE_FIELD] = self.bench_start_date
|
||||
logger.info("end of get new companies.")
|
||||
return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(get_instruments)
|
@ -0,0 +1,8 @@
|
||||
baostock
|
||||
|
||||
fire
|
||||
requests
|
||||
pandas
|
||||
lxml
|
||||
loguru
|
||||
tqdm
|
@ -0,0 +1,23 @@
|
||||
# Use 1d data to fill in the missing symbols relative to 1min
|
||||
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## fill 1min data
|
||||
|
||||
```bash
|
||||
python fill_1min_using_1d.py --data_1min_dir ~/.qlib/csv_data/cn_data_1min --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
- data_1min_dir: csv data
|
||||
- qlib_data_1d_dir: qlib data directory
|
||||
- max_workers: `ThreadPoolExecutor(max_workers=max_workers)`, by default *16*
|
||||
- date_field_name: date field name, by default *date*
|
||||
- symbol_field_name: symbol field name, by default *symbol*
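
The optional parameters can also be passed explicitly; a minimal sketch using the documented defaults:

```bash
python fill_1min_using_1d.py --data_1min_dir ~/.qlib/csv_data/cn_data_1min --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data --max_workers 16 --date_field_name date --symbol_field_name symbol
```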
|
||||
|
@ -0,0 +1,100 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import fire
|
||||
import qlib
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from qlib.data import D
|
||||
from loguru import logger
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent.parent))
|
||||
from data_collector.utils import generate_minutes_calendar_from_daily
|
||||
|
||||
|
||||
def get_date_range(data_1min_dir: Path, max_workers: int = 16, date_field_name: str = "date"):
|
||||
csv_files = list(data_1min_dir.glob("*.csv"))
|
||||
min_date = None
|
||||
max_date = None
|
||||
with tqdm(total=len(csv_files)) as p_bar:
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
for _file, _result in zip(csv_files, executor.map(pd.read_csv, csv_files)):
|
||||
if not _result.empty:
|
||||
_dates = pd.to_datetime(_result[date_field_name])
|
||||
|
||||
_tmp_min = _dates.min()
|
||||
min_date = min(min_date, _tmp_min) if min_date is not None else _tmp_min
|
||||
_tmp_max = _dates.max()
|
||||
max_date = max(max_date, _tmp_max) if max_date is not None else _tmp_max
|
||||
p_bar.update()
|
||||
return min_date, max_date
|
||||
|
||||
|
||||
def get_symbols(data_1min_dir: Path):
|
||||
return list(map(lambda x: x.name[:-4].upper(), data_1min_dir.glob("*.csv")))
|
||||
|
||||
|
||||
def fill_1min_using_1d(
|
||||
data_1min_dir: [str, Path],
|
||||
qlib_data_1d_dir: [str, Path],
|
||||
max_workers: int = 16,
|
||||
date_field_name: str = "date",
|
||||
symbol_field_name: str = "symbol",
|
||||
):
|
||||
"""Use 1d data to fill in the missing symbols relative to 1min
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data_1min_dir: str
|
||||
1min data dir
|
||||
qlib_data_1d_dir: str
|
||||
1d qlib data(bin data) dir, from: https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format
|
||||
max_workers: int
|
||||
ThreadPoolExecutor(max_workers), by default 16
|
||||
date_field_name: str
|
||||
date field name, by default date
|
||||
symbol_field_name: str
|
||||
symbol field name, by default symbol
|
||||
|
||||
"""
|
||||
data_1min_dir = Path(data_1min_dir).expanduser().resolve()
|
||||
qlib_data_1d_dir = Path(qlib_data_1d_dir).expanduser().resolve()
|
||||
|
||||
min_date, max_date = get_date_range(data_1min_dir, max_workers, date_field_name)
|
||||
symbols_1min = get_symbols(data_1min_dir)
|
||||
|
||||
qlib.init(provider_uri=str(qlib_data_1d_dir))
|
||||
data_1d = D.features(D.instruments("all"), ["$close"], min_date, max_date, freq="day")
|
||||
|
||||
miss_symbols = set(data_1d.index.get_level_values(level="instrument").unique()) - set(symbols_1min)
|
||||
if not miss_symbols:
|
||||
logger.warning("1min data already covers all 1d symbols, no padding required")
|
||||
return
|
||||
|
||||
logger.info(f"miss_symbols {len(miss_symbols)}: {miss_symbols}")
|
||||
tmp_df = pd.read_csv(list(data_1min_dir.glob("*.csv"))[0])
|
||||
columns = tmp_df.columns
|
||||
_si = tmp_df[symbol_field_name].first_valid_index()
|
||||
is_lower = tmp_df.loc[_si][symbol_field_name].islower()
|
||||
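# for each missing symbol, build an empty 1min frame on the minute calendar derived from its daily index,
# tag it with the symbol, and write it out as <symbol>.csv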
for symbol in tqdm(miss_symbols):
|
||||
if is_lower:
|
||||
symbol = symbol.lower()
|
||||
index_1d = data_1d.loc(axis=0)[symbol.upper()].index
|
||||
index_1min = generate_minutes_calendar_from_daily(index_1d)
|
||||
index_1min.name = date_field_name
|
||||
_df = pd.DataFrame(columns=columns, index=index_1min)
|
||||
if date_field_name in _df.columns:
|
||||
del _df[date_field_name]
|
||||
_df.reset_index(inplace=True)
|
||||
_df[symbol_field_name] = symbol
|
||||
_df["paused_num"] = 0
|
||||
_df.to_csv(data_1min_dir.joinpath(f"{symbol}.csv"), index=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(fill_1min_using_1d)
|
@ -0,0 +1,5 @@
|
||||
fire
|
||||
pandas
|
||||
loguru
|
||||
tqdm
|
||||
pyqlib
|
@ -0,0 +1,24 @@
|
||||
# Get future trading days
|
||||
|
||||
> `D.calendar(future=True)` will be used
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Collector Data
|
||||
|
||||
```bash
|
||||
# parse instruments, used in qlib/instruments.
|
||||
python future_trading_date_collector.py --qlib_dir ~/.qlib/qlib_data/cn_data --freq day
|
||||
```
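
Once the future calendar has been written, it can be loaded back through qlib. A minimal sketch, assuming the same data directory is used for `qlib.init`:

```python
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data")
# future=True reads calendars/day_future.txt instead of calendars/day.txt
future_calendar = D.calendar(freq="day", future=True)
print(future_calendar[-5:])
```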
|
||||
|
||||
## Parameters
|
||||
|
||||
- qlib_dir: qlib data directory
|
||||
- freq: value from [`day`, `1min`], default `day`
|
||||
|
||||
|
||||
|
@ -0,0 +1,88 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import sys
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
|
||||
import fire
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
|
||||
# get data from baostock
|
||||
import baostock as bs
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent.parent))
|
||||
|
||||
|
||||
from data_collector.utils import generate_minutes_calendar_from_daily
|
||||
|
||||
|
||||
def read_calendar_from_qlib(qlib_dir: Path) -> pd.DataFrame:
|
||||
calendar_path = qlib_dir.joinpath("calendars").joinpath("day.txt")
|
||||
if not calendar_path.exists():
|
||||
return pd.DataFrame()
|
||||
return pd.read_csv(calendar_path, header=None)
|
||||
|
||||
|
||||
def write_calendar_to_qlib(qlib_dir: Path, date_list: List[str], freq: str = "day"):
|
||||
calendar_path = str(qlib_dir.joinpath("calendars").joinpath(f"{freq}_future.txt"))
|
||||
|
||||
np.savetxt(calendar_path, date_list, fmt="%s", encoding="utf-8")
|
||||
logger.info(f"write future calendars success: {calendar_path}")
|
||||
|
||||
|
||||
def generate_qlib_calendar(date_list: List[str], freq: str) -> List[str]:
|
||||
print(freq)
|
||||
if freq == "day":
|
||||
return date_list
|
||||
elif freq == "1min":
|
||||
date_list = generate_minutes_calendar_from_daily(date_list, freq=freq).tolist()
|
||||
return list(map(lambda x: pd.Timestamp(x).strftime("%Y-%m-%d %H:%M:%S"), date_list))
|
||||
else:
|
||||
raise ValueError(f"Unsupported freq: {freq}")
|
||||
|
||||
|
||||
def future_calendar_collector(qlib_dir: [str, Path], freq: str = "day"):
|
||||
"""get future calendar
|
||||
|
||||
Parameters
|
||||
----------
|
||||
qlib_dir: str or Path
|
||||
qlib data directory
|
||||
freq: str
|
||||
value from ["day", "1min"], by default day
|
||||
"""
|
||||
qlib_dir = Path(qlib_dir).expanduser().resolve()
|
||||
if not qlib_dir.exists():
|
||||
raise FileNotFoundError(str(qlib_dir))
|
||||
|
||||
lg = bs.login()
|
||||
if lg.error_code != "0":
|
||||
logger.error(f"login error: {lg.error_msg}")
|
||||
return
|
||||
# read daily calendar
|
||||
daily_calendar = read_calendar_from_qlib(qlib_dir)
|
||||
end_year = pd.Timestamp.now().year
|
||||
if daily_calendar.empty:
|
||||
start_year = pd.Timestamp.now().year
|
||||
else:
|
||||
start_year = pd.Timestamp(daily_calendar.iloc[-1, 0]).year
|
||||
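# query trading dates from baostock, from the last year present in the local daily calendar
# (or the current year if it is empty) through the end of the current year, keeping only actual trading days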
rs = bs.query_trade_dates(start_date=pd.Timestamp(f"{start_year}-01-01"), end_date=f"{end_year}-12-31")
|
||||
data_list = []
|
||||
while (rs.error_code == "0") & rs.next():
|
||||
_row_data = rs.get_row_data()
|
||||
if int(_row_data[1]) == 1:
|
||||
data_list.append(_row_data[0])
|
||||
data_list = sorted(data_list)
|
||||
date_list = generate_qlib_calendar(data_list, freq=freq)
|
||||
date_list = sorted(set(daily_calendar.loc[:, 0].values.tolist() + date_list))
|
||||
write_calendar_to_qlib(qlib_dir, date_list, freq=freq)
|
||||
bs.logout()
|
||||
logger.info(f"get trading dates success: {start_year}-01-01 to {end_year}-12-31")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(future_calendar_collector)
|
@ -0,0 +1,5 @@
|
||||
baostock
|
||||
fire
|
||||
numpy
|
||||
pandas
|
||||
loguru
|
@ -0,0 +1,54 @@
|
||||
# Collect Crypto Data
|
||||
|
||||
> *Please pay **ATTENTION** that the data is collected from [Coingecko](https://www.coingecko.com/en/api) and the data might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Usage of the dataset
|
||||
> *The Crypto dataset only supports data retrieval; it does not support backtesting, due to the lack of OHLC data.*
|
||||
|
||||
## Collector Data
|
||||
|
||||
|
||||
### Crypto Data
|
||||
|
||||
#### 1d from Coingecko
|
||||
|
||||
```bash
|
||||
|
||||
# download from https://api.coingecko.com/api/v3/
|
||||
python collector.py download_data --source_dir ~/.qlib/crypto_data/source/1d --start 2015-01-01 --end 2021-11-30 --delay 1 --interval 1d
|
||||
|
||||
# normalize
|
||||
python collector.py normalize_data --source_dir ~/.qlib/crypto_data/source/1d --normalize_dir ~/.qlib/crypto_data/source/1d_nor --interval 1d --date_field_name date
|
||||
|
||||
# dump data
|
||||
cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/crypto_data/source/1d_nor --qlib_dir ~/.qlib/qlib_data/crypto_data --freq day --date_field_name date --include_fields prices,total_volumes,market_caps
|
||||
|
||||
```
|
||||
|
||||
### Using data
|
||||
|
||||
```python
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
|
||||
qlib.init(provider_uri="~/.qlib/qlib_data/crypto_data")
|
||||
df = D.features(D.instruments(market="all"), ["$prices", "$total_volumes","$market_caps"], freq="day")
|
||||
```
|
||||
|
||||
|
||||
### Help
|
||||
```bash
|
||||
python collector.py collector_data --help
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
- interval: 1d
|
||||
- delay: 1
|
@ -0,0 +1,311 @@
|
||||
import abc
|
||||
import sys
|
||||
import datetime
|
||||
from abc import ABC
|
||||
from pathlib import Path
|
||||
|
||||
import fire
|
||||
import requests
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
from dateutil.tz import tzlocal
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent))
|
||||
from data_collector.base import BaseCollector, BaseNormalize, BaseRun
|
||||
from data_collector.utils import deco_retry
|
||||
|
||||
from pycoingecko import CoinGeckoAPI
|
||||
from time import mktime
|
||||
from datetime import datetime as dt
|
||||
import time
|
||||
|
||||
|
||||
_CG_CRYPTO_SYMBOLS = None
|
||||
|
||||
|
||||
def get_cg_crypto_symbols(qlib_data_path: [str, Path] = None) -> list:
|
||||
"""get crypto symbols in coingecko
|
||||
|
||||
Returns
|
||||
-------
|
||||
list of crypto symbols available on CoinGecko
|
||||
"""
|
||||
global _CG_CRYPTO_SYMBOLS
|
||||
|
||||
@deco_retry
|
||||
def _get_coingecko():
|
||||
try:
|
||||
cg = CoinGeckoAPI()
|
||||
resp = pd.DataFrame(cg.get_coins_markets(vs_currency="usd"))
|
||||
except:
|
||||
raise ValueError("request error")
|
||||
try:
|
||||
_symbols = resp["id"].to_list()
|
||||
except Exception as e:
|
||||
logger.warning(f"request error: {e}")
|
||||
raise
|
||||
return _symbols
|
||||
|
||||
if _CG_CRYPTO_SYMBOLS is None:
|
||||
_all_symbols = _get_coingecko()
|
||||
|
||||
_CG_CRYPTO_SYMBOLS = sorted(set(_all_symbols))
|
||||
|
||||
return _CG_CRYPTO_SYMBOLS
|
||||
|
||||
|
||||
class CryptoCollector(BaseCollector):
|
||||
def __init__(
|
||||
self,
|
||||
save_dir: [str, Path],
|
||||
start=None,
|
||||
end=None,
|
||||
interval="1d",
|
||||
max_workers=1,
|
||||
max_collector_count=2,
|
||||
delay=1,  # delay needs to be 1
|
||||
check_data_length: int = None,
|
||||
limit_nums: int = None,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
save_dir: str
|
||||
crypto save dir
|
||||
max_workers: int
|
||||
workers, default 1
|
||||
max_collector_count: int
|
||||
default 2
|
||||
delay: float
|
||||
time.sleep(delay), default 1
|
||||
interval: str
|
||||
freq, value from [1min, 1d], default 1d
|
||||
start: str
|
||||
start datetime, default None
|
||||
end: str
|
||||
end datetime, default None
|
||||
check_data_length: int
|
||||
check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
|
||||
limit_nums: int
|
||||
using for debug, by default None
|
||||
"""
|
||||
super(CryptoCollector, self).__init__(
|
||||
save_dir=save_dir,
|
||||
start=start,
|
||||
end=end,
|
||||
interval=interval,
|
||||
max_workers=max_workers,
|
||||
max_collector_count=max_collector_count,
|
||||
delay=delay,
|
||||
check_data_length=check_data_length,
|
||||
limit_nums=limit_nums,
|
||||
)
|
||||
|
||||
self.init_datetime()
|
||||
|
||||
def init_datetime(self):
|
||||
if self.interval == self.INTERVAL_1min:
|
||||
self.start_datetime = max(self.start_datetime, self.DEFAULT_START_DATETIME_1MIN)
|
||||
elif self.interval == self.INTERVAL_1d:
|
||||
pass
|
||||
else:
|
||||
raise ValueError(f"interval error: {self.interval}")
|
||||
|
||||
self.start_datetime = self.convert_datetime(self.start_datetime, self._timezone)
|
||||
self.end_datetime = self.convert_datetime(self.end_datetime, self._timezone)
|
||||
|
||||
@staticmethod
|
||||
def convert_datetime(dt: [pd.Timestamp, datetime.date, str], timezone):
|
||||
try:
|
||||
dt = pd.Timestamp(dt, tz=timezone).timestamp()
|
||||
dt = pd.Timestamp(dt, tz=tzlocal(), unit="s")
|
||||
except ValueError as e:
|
||||
pass
|
||||
return dt
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def _timezone(self):
|
||||
raise NotImplementedError("rewrite get_timezone")
|
||||
|
||||
@staticmethod
|
||||
def get_data_from_remote(symbol, interval, start, end):
|
||||
error_msg = f"{symbol}-{interval}-{start}-{end}"
|
||||
try:
|
||||
cg = CoinGeckoAPI()
|
||||
data = cg.get_coin_market_chart_by_id(id=symbol, vs_currency="usd", days="max")
|
||||
_resp = pd.DataFrame(columns=["date"] + list(data.keys()))
|
||||
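# CoinGecko returns [timestamp_ms, value] pairs; convert the millisecond epoch timestamps to local dates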
_resp["date"] = [dt.fromtimestamp(mktime(time.localtime(x[0] / 1000))) for x in data["prices"]]
|
||||
for key in data.keys():
|
||||
_resp[key] = [x[1] for x in data[key]]
|
||||
_resp["date"] = pd.to_datetime(_resp["date"])
|
||||
_resp["date"] = [x.date() for x in _resp["date"]]
|
||||
_resp = _resp[(_resp["date"] < pd.to_datetime(end).date()) & (_resp["date"] > pd.to_datetime(start).date())]
|
||||
if _resp.shape[0] != 0:
|
||||
_resp = _resp.reset_index()
|
||||
if isinstance(_resp, pd.DataFrame):
|
||||
return _resp.reset_index()
|
||||
except Exception as e:
|
||||
logger.warning(f"{error_msg}:{e}")
|
||||
|
||||
def get_data(
|
||||
self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
|
||||
) -> [pd.DataFrame]:
|
||||
def _get_simple(start_, end_):
|
||||
self.sleep()
|
||||
_remote_interval = interval
|
||||
return self.get_data_from_remote(
|
||||
symbol,
|
||||
interval=_remote_interval,
|
||||
start=start_,
|
||||
end=end_,
|
||||
)
|
||||
|
||||
if interval == self.INTERVAL_1d:
|
||||
_result = _get_simple(start_datetime, end_datetime)
|
||||
else:
|
||||
raise ValueError(f"cannot support {interval}")
|
||||
return _result
|
||||
|
||||
|
||||
class CryptoCollector1d(CryptoCollector, ABC):
|
||||
def get_instrument_list(self):
|
||||
logger.info("get coingecko crypto symbols......")
|
||||
symbols = get_cg_crypto_symbols()
|
||||
logger.info(f"get {len(symbols)} symbols.")
|
||||
return symbols
|
||||
|
||||
def normalize_symbol(self, symbol):
|
||||
return symbol
|
||||
|
||||
@property
|
||||
def _timezone(self):
|
||||
return "Asia/Shanghai"
|
||||
|
||||
|
||||
class CryptoNormalize(BaseNormalize):
|
||||
DAILY_FORMAT = "%Y-%m-%d"
|
||||
|
||||
@staticmethod
|
||||
def normalize_crypto(
|
||||
df: pd.DataFrame,
|
||||
calendar_list: list = None,
|
||||
date_field_name: str = "date",
|
||||
symbol_field_name: str = "symbol",
|
||||
):
|
||||
if df.empty:
|
||||
return df
|
||||
df = df.copy()
|
||||
df.set_index(date_field_name, inplace=True)
|
||||
df.index = pd.to_datetime(df.index)
|
||||
df = df[~df.index.duplicated(keep="first")]
|
||||
if calendar_list is not None:
|
||||
df = df.reindex(
|
||||
pd.DataFrame(index=calendar_list)
|
||||
.loc[
|
||||
pd.Timestamp(df.index.min()).date() : pd.Timestamp(df.index.max()).date()
|
||||
+ pd.Timedelta(hours=23, minutes=59)
|
||||
]
|
||||
.index
|
||||
)
|
||||
df.sort_index(inplace=True)
|
||||
|
||||
df.index.names = [date_field_name]
|
||||
return df.reset_index()
|
||||
|
||||
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
df = self.normalize_crypto(df, self._calendar_list, self._date_field_name, self._symbol_field_name)
|
||||
return df
|
||||
|
||||
|
||||
class CryptoNormalize1d(CryptoNormalize):
|
||||
def _get_calendar_list(self):
|
||||
return None
|
||||
|
||||
|
||||
class Run(BaseRun):
|
||||
def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d"):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_dir: str
|
||||
The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
|
||||
normalize_dir: str
|
||||
Directory for normalized data, default "Path(__file__).parent/normalize"
|
||||
max_workers: int
|
||||
Concurrent number, default is 1
|
||||
interval: str
|
||||
freq, value from [1min, 1d], default 1d
|
||||
"""
|
||||
super().__init__(source_dir, normalize_dir, max_workers, interval)
|
||||
|
||||
@property
|
||||
def collector_class_name(self):
|
||||
return f"CryptoCollector{self.interval}"
|
||||
|
||||
@property
|
||||
def normalize_class_name(self):
|
||||
return f"CryptoNormalize{self.interval}"
|
||||
|
||||
@property
|
||||
def default_base_dir(self) -> [Path, str]:
|
||||
return CUR_DIR
|
||||
|
||||
def download_data(
|
||||
self,
|
||||
max_collector_count=2,
|
||||
delay=0,
|
||||
start=None,
|
||||
end=None,
|
||||
check_data_length: int = None,
|
||||
limit_nums=None,
|
||||
):
|
||||
"""download data from Internet
|
||||
|
||||
Parameters
|
||||
----------
|
||||
max_collector_count: int
|
||||
default 2
|
||||
delay: float
|
||||
time.sleep(delay), default 0
|
||||
interval: str
|
||||
freq, value from [1min, 1d], default 1d, currently only supports 1d
|
||||
start: str
|
||||
start datetime, default "2000-01-01"
|
||||
end: str
|
||||
end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
|
||||
check_data_length: int # if this param useful?
|
||||
check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
|
||||
limit_nums: int
|
||||
using for debug, by default None
|
||||
|
||||
Examples
|
||||
---------
|
||||
# get daily data
|
||||
$ python collector.py download_data --source_dir ~/.qlib/crypto_data/source/1d --start 2015-01-01 --end 2021-11-30 --delay 1 --interval 1d
|
||||
"""
|
||||
|
||||
super(Run, self).download_data(max_collector_count, delay, start, end, check_data_length, limit_nums)
|
||||
|
||||
def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol"):
|
||||
"""normalize data
|
||||
|
||||
Parameters
|
||||
----------
|
||||
date_field_name: str
|
||||
date field name, default date
|
||||
symbol_field_name: str
|
||||
symbol field name, default symbol
|
||||
|
||||
Examples
|
||||
---------
|
||||
$ python collector.py normalize_data --source_dir ~/.qlib/crypto_data/source/1d --normalize_dir ~/.qlib/crypto_data/source/1d_nor --interval 1d --date_field_name date
|
||||
"""
|
||||
super(Run, self).normalize_data(date_field_name, symbol_field_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(Run)
|
@ -0,0 +1,8 @@
|
||||
loguru
|
||||
fire
|
||||
requests
|
||||
numpy
|
||||
pandas
|
||||
tqdm
|
||||
lxml
|
||||
pycoingecko
|
@ -0,0 +1,51 @@
|
||||
# Collect Fund Data
|
||||
|
||||
> *Please pay **ATTENTION** that the data is collected from [天天基金网](https://fund.eastmoney.com/) and the data might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Collector Data
|
||||
|
||||
|
||||
### CN Data
|
||||
|
||||
#### 1d from East Money
|
||||
|
||||
```bash
|
||||
|
||||
# download from eastmoney.com
|
||||
python collector.py download_data --source_dir ~/.qlib/fund_data/source/cn_data --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
|
||||
|
||||
# normalize
|
||||
python collector.py normalize_data --source_dir ~/.qlib/fund_data/source/cn_data --normalize_dir ~/.qlib/fund_data/source/cn_1d_nor --region CN --interval 1d --date_field_name FSRQ
|
||||
|
||||
# dump data
|
||||
cd qlib/scripts
|
||||
python dump_bin.py dump_all --csv_path ~/.qlib/fund_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_fund_data --freq day --date_field_name FSRQ --include_fields DWJZ,LJJZ
|
||||
|
||||
```
|
||||
|
||||
### Using data
|
||||
|
||||
```python
|
||||
import qlib
|
||||
from qlib.data import D
|
||||
|
||||
qlib.init(provider_uri="~/.qlib/qlib_data/cn_fund_data")
|
||||
df = D.features(D.instruments(market="all"), ["$DWJZ", "$LJJZ"], freq="day")
|
||||
```
|
||||
|
||||
|
||||
### Help
|
||||
```bash
|
||||
python collector.py collector_data --help
|
||||
```
|
||||
|
||||
## Parameters
|
||||
|
||||
- interval: 1d
|
||||
- region: CN
|
@ -0,0 +1,304 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import abc
|
||||
import sys
|
||||
import datetime
|
||||
import json
|
||||
from abc import ABC
|
||||
from pathlib import Path
|
||||
|
||||
import fire
|
||||
import requests
|
||||
import pandas as pd
|
||||
from loguru import logger
|
||||
from dateutil.tz import tzlocal
|
||||
from qlib.constant import REG_CN as REGION_CN
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent))
|
||||
from data_collector.base import BaseCollector, BaseNormalize, BaseRun
|
||||
from data_collector.utils import get_calendar_list, get_en_fund_symbols
|
||||
|
||||
INDEX_BENCH_URL = "http://api.fund.eastmoney.com/f10/lsjz?callback=jQuery_&fundCode={index_code}&pageIndex=1&pageSize={numberOfHistoricalDaysToCrawl}&startDate={startDate}&endDate={endDate}"
|
||||
|
||||
|
||||
class FundCollector(BaseCollector):
|
||||
def __init__(
|
||||
self,
|
||||
save_dir: [str, Path],
|
||||
start=None,
|
||||
end=None,
|
||||
interval="1d",
|
||||
max_workers=4,
|
||||
max_collector_count=2,
|
||||
delay=0,
|
||||
check_data_length: int = None,
|
||||
limit_nums: int = None,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
save_dir: str
|
||||
fund save dir
|
||||
max_workers: int
|
||||
workers, default 4
|
||||
max_collector_count: int
|
||||
default 2
|
||||
delay: float
|
||||
time.sleep(delay), default 0
|
||||
interval: str
|
||||
freq, value from [1min, 1d], default 1d
|
||||
start: str
|
||||
start datetime, default None
|
||||
end: str
|
||||
end datetime, default None
|
||||
check_data_length: int
|
||||
check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
|
||||
limit_nums: int
|
||||
using for debug, by default None
|
||||
"""
|
||||
super(FundCollector, self).__init__(
|
||||
save_dir=save_dir,
|
||||
start=start,
|
||||
end=end,
|
||||
interval=interval,
|
||||
max_workers=max_workers,
|
||||
max_collector_count=max_collector_count,
|
||||
delay=delay,
|
||||
check_data_length=check_data_length,
|
||||
limit_nums=limit_nums,
|
||||
)
|
||||
|
||||
self.init_datetime()
|
||||
|
||||
def init_datetime(self):
|
||||
if self.interval == self.INTERVAL_1min:
|
||||
self.start_datetime = max(self.start_datetime, self.DEFAULT_START_DATETIME_1MIN)
|
||||
elif self.interval == self.INTERVAL_1d:
|
||||
pass
|
||||
else:
|
||||
raise ValueError(f"interval error: {self.interval}")
|
||||
|
||||
self.start_datetime = self.convert_datetime(self.start_datetime, self._timezone)
|
||||
self.end_datetime = self.convert_datetime(self.end_datetime, self._timezone)
|
||||
|
||||
@staticmethod
|
||||
def convert_datetime(dt: [pd.Timestamp, datetime.date, str], timezone):
|
||||
try:
|
||||
dt = pd.Timestamp(dt, tz=timezone).timestamp()
|
||||
dt = pd.Timestamp(dt, tz=tzlocal(), unit="s")
|
||||
except ValueError as e:
|
||||
pass
|
||||
return dt
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def _timezone(self):
|
||||
raise NotImplementedError("rewrite get_timezone")
|
||||
|
||||
@staticmethod
|
||||
def get_data_from_remote(symbol, interval, start, end):
|
||||
error_msg = f"{symbol}-{interval}-{start}-{end}"
|
||||
|
||||
try:
|
||||
# TODO: numberOfHistoricalDaysToCrawl should be bigger enough
|
||||
url = INDEX_BENCH_URL.format(
|
||||
index_code=symbol, numberOfHistoricalDaysToCrawl=10000, startDate=start, endDate=end
|
||||
)
|
||||
resp = requests.get(url, headers={"referer": "http://fund.eastmoney.com/110022.html"})
|
||||
|
||||
if resp.status_code != 200:
|
||||
raise ValueError("request error")
|
||||
|
||||
data = json.loads(resp.text.split("(")[-1].split(")")[0])
|
||||
|
||||
# Some funds don't show the net value, example: http://fundf10.eastmoney.com/jjjz_010288.html
|
||||
SYType = data["Data"]["SYType"]
|
||||
if (SYType == "每万份收益") or (SYType == "每百份收益") or (SYType == "每百万份收益"):
|
||||
raise Exception("The fund contains 每*份收益")
|
||||
|
||||
# TODO: should we sort the value by datetime?
|
||||
_resp = pd.DataFrame(data["Data"]["LSJZList"])
|
||||
|
||||
if isinstance(_resp, pd.DataFrame):
|
||||
return _resp.reset_index()
|
||||
except Exception as e:
|
||||
logger.warning(f"{error_msg}:{e}")
|
||||
|
||||
def get_data(
|
||||
self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
|
||||
) -> [pd.DataFrame]:
|
||||
def _get_simple(start_, end_):
|
||||
self.sleep()
|
||||
_remote_interval = interval
|
||||
return self.get_data_from_remote(
|
||||
symbol,
|
||||
interval=_remote_interval,
|
||||
start=start_,
|
||||
end=end_,
|
||||
)
|
||||
|
||||
if interval == self.INTERVAL_1d:
|
||||
_result = _get_simple(start_datetime, end_datetime)
|
||||
else:
|
||||
raise ValueError(f"cannot support {interval}")
|
||||
return _result
|
||||
|
||||
|
||||
class FundCollectorCN(FundCollector, ABC):
|
||||
def get_instrument_list(self):
|
||||
logger.info("get cn fund symbols......")
|
||||
symbols = get_en_fund_symbols()
|
||||
logger.info(f"get {len(symbols)} symbols.")
|
||||
return symbols
|
||||
|
||||
def normalize_symbol(self, symbol):
|
||||
return symbol
|
||||
|
||||
@property
|
||||
def _timezone(self):
|
||||
return "Asia/Shanghai"
|
||||
|
||||
|
||||
class FundCollectorCN1d(FundCollectorCN):
|
||||
pass
|
||||
|
||||
|
||||
class FundNormalize(BaseNormalize):
|
||||
DAILY_FORMAT = "%Y-%m-%d"
|
||||
|
||||
@staticmethod
|
||||
def normalize_fund(
|
||||
df: pd.DataFrame,
|
||||
calendar_list: list = None,
|
||||
date_field_name: str = "date",
|
||||
symbol_field_name: str = "symbol",
|
||||
):
|
||||
if df.empty:
|
||||
return df
|
||||
df = df.copy()
|
||||
df.set_index(date_field_name, inplace=True)
|
||||
df.index = pd.to_datetime(df.index)
|
||||
df = df[~df.index.duplicated(keep="first")]
|
||||
if calendar_list is not None:
|
||||
df = df.reindex(
|
||||
pd.DataFrame(index=calendar_list)
|
||||
.loc[
|
||||
pd.Timestamp(df.index.min()).date() : pd.Timestamp(df.index.max()).date()
|
||||
+ pd.Timedelta(hours=23, minutes=59)
|
||||
]
|
||||
.index
|
||||
)
|
||||
df.sort_index(inplace=True)
|
||||
|
||||
df.index.names = [date_field_name]
|
||||
return df.reset_index()
|
||||
|
||||
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
# normalize
|
||||
df = self.normalize_fund(df, self._calendar_list, self._date_field_name, self._symbol_field_name)
|
||||
return df
|
||||
|
||||
|
||||
class FundNormalize1d(FundNormalize):
|
||||
pass
|
||||
|
||||
|
||||
class FundNormalizeCN:
|
||||
def _get_calendar_list(self):
|
||||
return get_calendar_list("ALL")
|
||||
|
||||
|
||||
class FundNormalizeCN1d(FundNormalizeCN, FundNormalize1d):
|
||||
pass
|
||||
|
||||
|
||||
class Run(BaseRun):
|
||||
def __init__(self, source_dir=None, normalize_dir=None, max_workers=4, interval="1d", region=REGION_CN):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_dir: str
|
||||
The directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
|
||||
normalize_dir: str
|
||||
Directory for normalized data, default "Path(__file__).parent/normalize"
|
||||
max_workers: int
|
||||
Concurrent number, default is 4
|
||||
interval: str
|
||||
freq, value from [1min, 1d], default 1d
|
||||
region: str
|
||||
region, value from ["CN"], default "CN"
|
||||
"""
|
||||
super().__init__(source_dir, normalize_dir, max_workers, interval)
|
||||
self.region = region
|
||||
|
||||
@property
|
||||
def collector_class_name(self):
|
||||
return f"FundCollector{self.region.upper()}{self.interval}"
|
||||
|
||||
@property
|
||||
def normalize_class_name(self):
|
||||
return f"FundNormalize{self.region.upper()}{self.interval}"
|
||||
|
||||
@property
|
||||
def default_base_dir(self) -> [Path, str]:
|
||||
return CUR_DIR
|
||||
|
||||
def download_data(
|
||||
self,
|
||||
max_collector_count=2,
|
||||
delay=0,
|
||||
start=None,
|
||||
end=None,
|
||||
check_data_length: int = None,
|
||||
limit_nums=None,
|
||||
):
|
||||
"""download data from Internet
|
||||
|
||||
Parameters
|
||||
----------
|
||||
max_collector_count: int
|
||||
default 2
|
||||
delay: float
|
||||
time.sleep(delay), default 0
|
||||
interval: str
|
||||
freq, value from [1min, 1d], default 1d
|
||||
start: str
|
||||
start datetime, default "2000-01-01"
|
||||
end: str
|
||||
end datetime, default ``pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))``
|
||||
check_data_length: int # if this param useful?
|
||||
check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
|
||||
limit_nums: int
|
||||
using for debug, by default None
|
||||
|
||||
Examples
|
||||
---------
|
||||
# get daily data
|
||||
$ python collector.py download_data --source_dir ~/.qlib/fund_data/source/cn_data --region CN --start 2020-11-01 --end 2020-11-10 --delay 0.1 --interval 1d
|
||||
"""
|
||||
|
||||
super(Run, self).download_data(max_collector_count, delay, start, end, check_data_length, limit_nums)
|
||||
|
||||
def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol"):
|
||||
"""normalize data
|
||||
|
||||
Parameters
|
||||
----------
|
||||
date_field_name: str
|
||||
date field name, default date
|
||||
symbol_field_name: str
|
||||
symbol field name, default symbol
|
||||
|
||||
Examples
|
||||
---------
|
||||
$ python collector.py normalize_data --source_dir ~/.qlib/fund_data/source/cn_data --normalize_dir ~/.qlib/fund_data/source/cn_1d_nor --region CN --interval 1d --date_field_name FSRQ
|
||||
"""
|
||||
super(Run, self).normalize_data(date_field_name, symbol_field_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(Run)
|
@ -0,0 +1,10 @@
|
||||
loguru
|
||||
fire
|
||||
requests
|
||||
numpy
|
||||
pandas
|
||||
tqdm
|
||||
lxml
|
||||
|
||||
yahooquery
|
||||
|
@ -0,0 +1,121 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import abc
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
from typing import Union, Iterable, List
|
||||
|
||||
import fire
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# pip install baostock
|
||||
import baostock as bs
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class CollectorFutureCalendar:
|
||||
calendar_format = "%Y-%m-%d"
|
||||
|
||||
def __init__(self, qlib_dir: Union[str, Path], start_date: str = None, end_date: str = None):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
qlib_dir:
|
||||
qlib data directory
|
||||
start_date
|
||||
start date
|
||||
end_date
|
||||
end date
|
||||
"""
|
||||
self.qlib_dir = Path(qlib_dir).expanduser().absolute()
|
||||
self.calendar_path = self.qlib_dir.joinpath("calendars/day.txt")
|
||||
self.future_path = self.qlib_dir.joinpath("calendars/day_future.txt")
|
||||
self._calendar_list = self.calendar_list
|
||||
_latest_date = self._calendar_list[-1]
|
||||
self.start_date = _latest_date if start_date is None else pd.Timestamp(start_date)
|
||||
self.end_date = _latest_date + pd.Timedelta(days=365 * 2) if end_date is None else pd.Timestamp(end_date)
|
||||
|
||||
@property
|
||||
def calendar_list(self) -> List[pd.Timestamp]:
|
||||
# load old calendar
|
||||
if not self.calendar_path.exists():
|
||||
raise ValueError(f"calendar does not exist: {self.calendar_path}")
|
||||
calendar_df = pd.read_csv(self.calendar_path, header=None)
|
||||
calendar_df.columns = ["date"]
|
||||
calendar_df["date"] = pd.to_datetime(calendar_df["date"])
|
||||
return calendar_df["date"].to_list()
|
||||
|
||||
def _format_datetime(self, datetime_d: [str, pd.Timestamp]):
|
||||
datetime_d = pd.Timestamp(datetime_d)
|
||||
return datetime_d.strftime(self.calendar_format)
|
||||
|
||||
def write_calendar(self, calendar: Iterable):
|
||||
calendars_list = list(map(lambda x: self._format_datetime(x), sorted(set(self.calendar_list + calendar))))
|
||||
np.savetxt(self.future_path, calendars_list, fmt="%s", encoding="utf-8")
|
||||
|
||||
@abc.abstractmethod
|
||||
def collector(self) -> Iterable[pd.Timestamp]:
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
raise NotImplementedError(f"Please implement the `collector` method")
|
||||
|
||||
|
||||
class CollectorFutureCalendarCN(CollectorFutureCalendar):
|
||||
def collector(self) -> Iterable[pd.Timestamp]:
|
||||
lg = bs.login()
|
||||
if lg.error_code != "0":
|
||||
raise ValueError(f"login respond error_msg: {lg.error_msg}")
|
||||
rs = bs.query_trade_dates(
|
||||
start_date=self._format_datetime(self.start_date), end_date=self._format_datetime(self.end_date)
|
||||
)
|
||||
if rs.error_code != "0":
|
||||
raise ValueError(f"query_trade_dates respond error_msg: {rs.error_msg}")
|
||||
data_list = []
|
||||
while (rs.error_code == "0") & rs.next():
|
||||
data_list.append(rs.get_row_data())
|
||||
calendar = pd.DataFrame(data_list, columns=rs.fields)
|
||||
calendar["is_trading_day"] = calendar["is_trading_day"].astype(int)
|
||||
return pd.to_datetime(calendar[calendar["is_trading_day"] == 1]["calendar_date"]).to_list()
|
||||
|
||||
|
||||
class CollectorFutureCalendarUS(CollectorFutureCalendar):
|
||||
def collector(self) -> Iterable[pd.Timestamp]:
|
||||
# TODO: US future calendar
|
||||
raise ValueError("US calendar is not supported")
|
||||
|
||||
|
||||
def run(qlib_dir: Union[str, Path], region: str = "cn", start_date: str = None, end_date: str = None):
|
||||
"""Collect future calendar(day)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
qlib_dir:
|
||||
qlib data directory
|
||||
region:
|
||||
cn/CN or us/US
|
||||
start_date
|
||||
start date
|
||||
end_date
|
||||
end date
|
||||
|
||||
Examples
|
||||
-------
|
||||
# get cn future calendar
|
||||
$ python future_calendar_collector.py --qlib_dir <user data dir> --region cn
|
||||
"""
|
||||
logger.info(f"collector future calendar: region={region}")
|
||||
_cur_module = importlib.import_module("future_calendar_collector")
|
||||
_class = getattr(_cur_module, f"CollectorFutureCalendar{region.upper()}")
|
||||
collector = _class(qlib_dir=qlib_dir, start_date=start_date, end_date=end_date)
|
||||
collector.write_calendar(collector.collector())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(run)
|
@ -0,0 +1,238 @@
|
||||
import sys
|
||||
import abc
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent))
|
||||
|
||||
|
||||
from data_collector.utils import get_trading_date_by_shift
|
||||
|
||||
|
||||
class IndexBase:
|
||||
DEFAULT_END_DATE = pd.Timestamp("2099-12-31")
|
||||
SYMBOL_FIELD_NAME = "symbol"
|
||||
DATE_FIELD_NAME = "date"
|
||||
START_DATE_FIELD = "start_date"
|
||||
END_DATE_FIELD = "end_date"
|
||||
CHANGE_TYPE_FIELD = "type"
|
||||
INSTRUMENTS_COLUMNS = [SYMBOL_FIELD_NAME, START_DATE_FIELD, END_DATE_FIELD]
|
||||
REMOVE = "remove"
|
||||
ADD = "add"
|
||||
INST_PREFIX = ""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
index_name: str,
|
||||
qlib_dir: [str, Path] = None,
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
index_name: str
|
||||
index name
|
||||
qlib_dir: str
|
||||
qlib directory, by default Path(__file__).resolve().parent.joinpath("qlib_data")
|
||||
freq: str
|
||||
freq, value from ["day", "1min"]
|
||||
request_retry: int
|
||||
request retry, by default 5
|
||||
retry_sleep: int
|
||||
request sleep, by default 3
|
||||
"""
|
||||
self.index_name = index_name
|
||||
if qlib_dir is None:
|
||||
qlib_dir = Path(__file__).resolve().parent.joinpath("qlib_data")
|
||||
self.instruments_dir = Path(qlib_dir).expanduser().resolve().joinpath("instruments")
|
||||
self.instruments_dir.mkdir(exist_ok=True, parents=True)
|
||||
self.cache_dir = Path(f"~/.cache/qlib/index/{self.index_name}").expanduser().resolve()
|
||||
self.cache_dir.mkdir(exist_ok=True, parents=True)
|
||||
self._request_retry = request_retry
|
||||
self._retry_sleep = retry_sleep
|
||||
self.freq = freq
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
"""
|
||||
Returns
|
||||
-------
|
||||
index start date
|
||||
"""
|
||||
raise NotImplementedError("rewrite bench_start_date")
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def calendar_list(self) -> List[pd.Timestamp]:
|
||||
"""get history trading date
|
||||
|
||||
Returns
|
||||
-------
|
||||
calendar list
|
||||
"""
|
||||
raise NotImplementedError("rewrite calendar_list")
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_new_companies(self) -> pd.DataFrame:
|
||||
"""
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
|
||||
symbol start_date end_date
|
||||
SH600000 2000-01-01 2099-12-31
|
||||
|
||||
dtypes:
|
||||
symbol: str
|
||||
start_date: pd.Timestamp
|
||||
end_date: pd.Timestamp
|
||||
"""
|
||||
raise NotImplementedError("rewrite get_new_companies")
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
"""get companies changes
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
"""
|
||||
raise NotImplementedError("rewrite get_changes")
|
||||
|
||||
@abc.abstractmethod
|
||||
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""formatting the datetime in an instrument
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inst_df: pd.DataFrame
|
||||
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
raise NotImplementedError("rewrite format_datetime")
|
||||
|
||||
def save_new_companies(self):
|
||||
"""save new companies
|
||||
|
||||
Examples
|
||||
-------
|
||||
$ python collector.py save_new_companies --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data
|
||||
"""
|
||||
df = self.get_new_companies()
|
||||
if df is None or df.empty:
|
||||
raise ValueError(f"get new companies error: {self.index_name}")
|
||||
df = df.drop_duplicates([self.SYMBOL_FIELD_NAME])
|
||||
df.loc[:, self.INSTRUMENTS_COLUMNS].to_csv(
|
||||
self.instruments_dir.joinpath(f"{self.index_name.lower()}_only_new.txt"), sep="\t", index=False, header=None
|
||||
)
|
||||
|
||||
def get_changes_with_history_companies(self, history_companies: pd.DataFrame) -> pd.DataFrame:
|
||||
"""get changes with history companies
|
||||
|
||||
Parameters
|
||||
----------
|
||||
history_companies : pd.DataFrame
|
||||
symbol date
|
||||
SH600000 2020-11-11
|
||||
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
|
||||
Return
|
||||
--------
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
|
||||
"""
|
||||
logger.info("parse changes from history companies......")
|
||||
last_code = []
|
||||
result_df_list = []
|
||||
_columns = [self.DATE_FIELD_NAME, self.SYMBOL_FIELD_NAME, self.CHANGE_TYPE_FIELD]
|
||||
for _trading_date in tqdm(sorted(history_companies[self.DATE_FIELD_NAME].unique(), reverse=True)):
|
||||
_current_code = history_companies[history_companies[self.DATE_FIELD_NAME] == _trading_date][
|
||||
self.SYMBOL_FIELD_NAME
|
||||
].tolist()
|
||||
if last_code:
|
||||
add_code = list(set(last_code) - set(_current_code))
|
||||
removed_code = list(set(_current_code) - set(last_code))
|
||||
for _code in add_code:
|
||||
result_df_list.append(
|
||||
pd.DataFrame(
|
||||
[[get_trading_date_by_shift(self.calendar_list, _trading_date, 1), _code, self.ADD]],
|
||||
columns=_columns,
|
||||
)
|
||||
)
|
||||
for _code in removed_code:
|
||||
result_df_list.append(
|
||||
pd.DataFrame(
|
||||
[[get_trading_date_by_shift(self.calendar_list, _trading_date, 0), _code, self.REMOVE]],
|
||||
columns=_columns,
|
||||
)
|
||||
)
|
||||
last_code = _current_code
|
||||
df = pd.concat(result_df_list)
|
||||
logger.info("end of parse changes from history companies.")
|
||||
return df
|
||||
|
||||
def parse_instruments(self):
|
||||
"""parse instruments, eg: csi300.txt
|
||||
|
||||
Examples
|
||||
-------
|
||||
$ python collector.py parse_instruments --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data
|
||||
"""
|
||||
logger.info(f"start parse {self.index_name.lower()} companies.....")
|
||||
instruments_columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
changes_df = self.get_changes()
|
||||
new_df = self.get_new_companies()
|
||||
if new_df is None or new_df.empty:
|
||||
raise ValueError(f"get new companies error: {self.index_name}")
|
||||
new_df = new_df.copy()
|
||||
logger.info("parse history companies by changes......")
|
||||
for _row in tqdm(changes_df.sort_values(self.DATE_FIELD_NAME, ascending=False).itertuples(index=False)):
|
||||
if _row.type == self.ADD:
|
||||
min_end_date = new_df.loc[new_df[self.SYMBOL_FIELD_NAME] == _row.symbol, self.END_DATE_FIELD].min()
|
||||
new_df.loc[
|
||||
(new_df[self.END_DATE_FIELD] == min_end_date) & (new_df[self.SYMBOL_FIELD_NAME] == _row.symbol),
|
||||
self.START_DATE_FIELD,
|
||||
] = _row.date
|
||||
else:
|
||||
_tmp_df = pd.DataFrame([[_row.symbol, self.bench_start_date, _row.date]], columns=instruments_columns)
|
||||
new_df = pd.concat([new_df, _tmp_df], sort=False)
|
||||
|
||||
inst_df = new_df.loc[:, instruments_columns]
|
||||
_inst_prefix = self.INST_PREFIX.strip()
|
||||
if _inst_prefix:
|
||||
inst_df["save_inst"] = inst_df[self.SYMBOL_FIELD_NAME].apply(lambda x: f"{_inst_prefix}{x}")
|
||||
inst_df = self.format_datetime(inst_df)
|
||||
inst_df.to_csv(
|
||||
self.instruments_dir.joinpath(f"{self.index_name.lower()}.txt"), sep="\t", index=False, header=None
|
||||
)
|
||||
logger.info(f"parse {self.index_name.lower()} companies finished.")
|
@ -0,0 +1,40 @@
|
||||
# Collect Point-in-Time Data
|
||||
|
||||
> *Please pay **ATTENTION** that the data is collected from [baostock](http://baostock.com) and the data might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Collector Data
|
||||
|
||||
|
||||
### Download Quarterly CN Data
|
||||
|
||||
```bash
|
||||
cd qlib/scripts/data_collector/pit/
|
||||
# download from baostock.com
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/pit --start 2000-01-01 --end 2020-01-01 --interval quarterly
|
||||
```
|
||||
|
||||
Downloading data for all stocks is very time-consuming. If you just want to run a quick test on a few stocks, you can run the command below:
|
||||
```bash
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/pit --start 2000-01-01 --end 2020-01-01 --interval quarterly --symbol_regex "^(600519|000725).*"
|
||||
```
|
||||
|
||||
|
||||
### Normalize Data
|
||||
```bash
|
||||
python collector.py normalize_data --interval quarterly --source_dir ~/.qlib/stock_data/source/pit --normalize_dir ~/.qlib/stock_data/source/pit_normalized
|
||||
```
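Before dumping, it can be worth eyeballing one normalized file. A small sketch, assuming the paths above and a hypothetical `sh600519.csv`; the expected columns (`date`, `period`, `value`, `field`) come from the collector code in this directory:

```python
from pathlib import Path

import pandas as pd

# hypothetical file; adjust to your --normalize_dir and to a symbol you actually downloaded
csv_path = Path("~/.qlib/stock_data/source/pit_normalized/sh600519.csv").expanduser()
df = pd.read_csv(csv_path)

print(df.columns.tolist())   # expected to include: date, period, value, field
print(df["field"].unique())  # e.g. ['roeWa', 'YOYNI'] for this baostock-based collector
print(df.head())
```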
|
||||
|
||||
|
||||
|
||||
### Dump Data into PIT Format
|
||||
|
||||
```bash
|
||||
cd qlib/scripts
|
||||
python dump_pit.py dump --csv_path ~/.qlib/stock_data/source/pit_normalized --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly
|
||||
```
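After the dump finishes, the point-in-time fields should be queryable through qlib's `P()` operator; a minimal sketch following the qlib PIT documentation (field names such as `$$roewa_q` depend on what was collected and dumped):

```python
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region="cn")
# roeWa collected above is exposed as the quarterly PIT field $$roewa_q
df = D.features(["sh600519"], ["P($$roewa_q)"], start_time="2019-01-01", end_time="2019-07-19", freq="day")
print(df.head())
```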
|
@ -0,0 +1,262 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Iterable, Optional, Union
|
||||
|
||||
import fire
|
||||
import pandas as pd
|
||||
import baostock as bs
|
||||
from loguru import logger
|
||||
|
||||
BASE_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(BASE_DIR.parent.parent))
|
||||
|
||||
from data_collector.base import BaseCollector, BaseRun, BaseNormalize
|
||||
from data_collector.utils import get_hs_stock_symbols, get_calendar_list
|
||||
|
||||
|
||||
class PitCollector(BaseCollector):
|
||||
DEFAULT_START_DATETIME_QUARTERLY = pd.Timestamp("2000-01-01")
|
||||
DEFAULT_START_DATETIME_ANNUAL = pd.Timestamp("2000-01-01")
|
||||
DEFAULT_END_DATETIME_QUARTERLY = pd.Timestamp(datetime.now() + pd.Timedelta(days=1))
|
||||
DEFAULT_END_DATETIME_ANNUAL = pd.Timestamp(datetime.now() + pd.Timedelta(days=1))
|
||||
|
||||
INTERVAL_QUARTERLY = "quarterly"
|
||||
INTERVAL_ANNUAL = "annual"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
save_dir: Union[str, Path],
|
||||
start: Optional[str] = None,
|
||||
end: Optional[str] = None,
|
||||
interval: str = "quarterly",
|
||||
max_workers: int = 1,
|
||||
max_collector_count: int = 1,
|
||||
delay: int = 0,
|
||||
check_data_length: bool = False,
|
||||
limit_nums: Optional[int] = None,
|
||||
symbol_regex: Optional[str] = None,
|
||||
):
|
||||
"""
|
||||
Parameters
|
||||
----------
|
||||
save_dir: str
|
||||
instrument save dir
|
||||
max_workers: int
|
||||
concurrent workers, by default 1; when collecting data it is recommended to keep max_workers at 1
|
||||
max_collector_count: int
|
||||
default 1
|
||||
delay: float
|
||||
time.sleep(delay), default 0
|
||||
interval: str
|
||||
freq, value from ["quarterly", "annual"], default "quarterly"
|
||||
start: str
|
||||
start datetime, default None
|
||||
end: str
|
||||
end datetime, default None
|
||||
check_data_length: int
|
||||
check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
|
||||
limit_nums: int
|
||||
using for debug, by default None
|
||||
symbol_regex: str
|
||||
symbol regular expression, by default None.
|
||||
"""
|
||||
self.symbol_regex = symbol_regex
|
||||
super().__init__(
|
||||
save_dir=save_dir,
|
||||
start=start,
|
||||
end=end,
|
||||
interval=interval,
|
||||
max_workers=max_workers,
|
||||
max_collector_count=max_collector_count,
|
||||
delay=delay,
|
||||
check_data_length=check_data_length,
|
||||
limit_nums=limit_nums,
|
||||
)
|
||||
|
||||
def get_instrument_list(self) -> List[str]:
|
||||
logger.info("get cn stock symbols......")
|
||||
symbols = get_hs_stock_symbols()
|
||||
if self.symbol_regex is not None:
|
||||
regex_compile = re.compile(self.symbol_regex)
|
||||
symbols = [symbol for symbol in symbols if regex_compile.match(symbol)]
|
||||
logger.info(f"get {len(symbols)} symbols.")
|
||||
return symbols
|
||||
|
||||
def normalize_symbol(self, symbol: str) -> str:
|
||||
symbol, exchange = symbol.split(".")
|
||||
exchange = "sh" if exchange == "ss" else "sz"
|
||||
return f"{exchange}{symbol}"
|
||||
|
||||
@staticmethod
|
||||
def get_performance_express_report_df(code: str, start_date: str, end_date: str) -> pd.DataFrame:
|
||||
column_mapping = {
|
||||
"performanceExpPubDate": "date",
|
||||
"performanceExpStatDate": "period",
|
||||
"performanceExpressROEWa": "value",
|
||||
}
|
||||
|
||||
resp = bs.query_performance_express_report(code=code, start_date=start_date, end_date=end_date)
|
||||
report_list = []
|
||||
while (resp.error_code == "0") and resp.next():
|
||||
report_list.append(resp.get_row_data())
|
||||
report_df = pd.DataFrame(report_list, columns=resp.fields)
|
||||
try:
|
||||
report_df = report_df[list(column_mapping.keys())]
|
||||
except KeyError:
|
||||
return pd.DataFrame()
|
||||
report_df.rename(columns=column_mapping, inplace=True)
|
||||
report_df["field"] = "roeWa"
|
||||
report_df["value"] = pd.to_numeric(report_df["value"], errors="ignore")
|
||||
report_df["value"] = report_df["value"].apply(lambda x: x / 100.0)
|
||||
return report_df
|
||||
|
||||
@staticmethod
|
||||
def get_profit_df(code: str, start_date: str, end_date: str) -> pd.DataFrame:
|
||||
column_mapping = {"pubDate": "date", "statDate": "period", "roeAvg": "value"}
|
||||
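# NOTE: a throwaway query against a fixed, well-known symbol ("sh.600519") is issued here only to
# obtain the list of field names that baostock returns for profit data.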
fields = bs.query_profit_data(code="sh.600519", year=2020, quarter=1).fields
|
||||
start_date = datetime.strptime(start_date, "%Y-%m-%d")
|
||||
end_date = datetime.strptime(end_date, "%Y-%m-%d")
|
||||
args = [(year, quarter) for quarter in range(1, 5) for year in range(start_date.year - 1, end_date.year + 1)]
|
||||
profit_list = []
|
||||
for year, quarter in args:
|
||||
resp = bs.query_profit_data(code=code, year=year, quarter=quarter)
|
||||
while (resp.error_code == "0") and resp.next():
|
||||
if "pubDate" not in resp.fields:
|
||||
continue
|
||||
row_data = resp.get_row_data()
|
||||
pub_date = pd.Timestamp(row_data[resp.fields.index("pubDate")])
|
||||
if start_date <= pub_date <= end_date and row_data:
|
||||
profit_list.append(row_data)
|
||||
profit_df = pd.DataFrame(profit_list, columns=fields)
|
||||
try:
|
||||
profit_df = profit_df[list(column_mapping.keys())]
|
||||
except KeyError:
|
||||
return pd.DataFrame()
|
||||
profit_df.rename(columns=column_mapping, inplace=True)
|
||||
profit_df["field"] = "roeWa"
|
||||
profit_df["value"] = pd.to_numeric(profit_df["value"], errors="ignore")
|
||||
return profit_df
|
||||
|
||||
@staticmethod
|
||||
def get_forecast_report_df(code: str, start_date: str, end_date: str) -> pd.DataFrame:
|
||||
column_mapping = {
|
||||
"profitForcastExpPubDate": "date",
|
||||
"profitForcastExpStatDate": "period",
|
||||
"value": "value",
|
||||
}
|
||||
resp = bs.query_forecast_report(code=code, start_date=start_date, end_date=end_date)
|
||||
forecast_list = []
|
||||
while (resp.error_code == "0") and resp.next():
|
||||
forecast_list.append(resp.get_row_data())
|
||||
forecast_df = pd.DataFrame(forecast_list, columns=resp.fields)
|
||||
numeric_fields = ["profitForcastChgPctUp", "profitForcastChgPctDwn"]
|
||||
try:
|
||||
forecast_df[numeric_fields] = forecast_df[numeric_fields].apply(pd.to_numeric, errors="ignore")
|
||||
except KeyError:
|
||||
return pd.DataFrame()
|
||||
forecast_df["value"] = (forecast_df["profitForcastChgPctUp"] + forecast_df["profitForcastChgPctDwn"]) / 200
|
||||
forecast_df = forecast_df[list(column_mapping.keys())]
|
||||
forecast_df.rename(columns=column_mapping, inplace=True)
|
||||
forecast_df["field"] = "YOYNI"
|
||||
return forecast_df
|
||||
|
||||
@staticmethod
|
||||
def get_growth_df(code: str, start_date: str, end_date: str) -> pd.DataFrame:
|
||||
column_mapping = {"pubDate": "date", "statDate": "period", "YOYNI": "value"}
|
||||
fields = bs.query_growth_data(code="sh.600519", year=2020, quarter=1).fields
|
||||
start_date = datetime.strptime(start_date, "%Y-%m-%d")
|
||||
end_date = datetime.strptime(end_date, "%Y-%m-%d")
|
||||
args = [(year, quarter) for quarter in range(1, 5) for year in range(start_date.year - 1, end_date.year + 1)]
|
||||
growth_list = []
|
||||
for year, quarter in args:
|
||||
resp = bs.query_growth_data(code=code, year=year, quarter=quarter)
|
||||
while (resp.error_code == "0") and resp.next():
|
||||
if "pubDate" not in resp.fields:
|
||||
continue
|
||||
row_data = resp.get_row_data()
|
||||
pub_date = pd.Timestamp(row_data[resp.fields.index("pubDate")])
|
||||
if start_date <= pub_date <= end_date and row_data:
|
||||
growth_list.append(row_data)
|
||||
growth_df = pd.DataFrame(growth_list, columns=fields)
|
||||
try:
|
||||
growth_df = growth_df[list(column_mapping.keys())]
|
||||
except KeyError:
|
||||
return pd.DataFrame()
|
||||
growth_df.rename(columns=column_mapping, inplace=True)
|
||||
growth_df["field"] = "YOYNI"
|
||||
growth_df["value"] = pd.to_numeric(growth_df["value"], errors="ignore")
|
||||
return growth_df
|
||||
|
||||
def get_data(
|
||||
self,
|
||||
symbol: str,
|
||||
interval: str,
|
||||
start_datetime: pd.Timestamp,
|
||||
end_datetime: pd.Timestamp,
|
||||
) -> pd.DataFrame:
|
||||
if interval != self.INTERVAL_QUARTERLY:
|
||||
raise ValueError(f"cannot support {interval}")
|
||||
symbol, exchange = symbol.split(".")
|
||||
exchange = "sh" if exchange == "ss" else "sz"
|
||||
code = f"{exchange}.{symbol}"
|
||||
start_date = start_datetime.strftime("%Y-%m-%d")
|
||||
end_date = end_datetime.strftime("%Y-%m-%d")
|
||||
|
||||
performance_express_report_df = self.get_performance_express_report_df(code, start_date, end_date)
|
||||
profit_df = self.get_profit_df(code, start_date, end_date)
|
||||
forecast_report_df = self.get_forecast_report_df(code, start_date, end_date)
|
||||
growth_df = self.get_growth_df(code, start_date, end_date)
|
||||
|
||||
df = pd.concat(
|
||||
[performance_express_report_df, profit_df, forecast_report_df, growth_df],
|
||||
axis=0,
|
||||
)
|
||||
return df
|
||||
|
||||
|
||||
class PitNormalize(BaseNormalize):
|
||||
def __init__(self, interval: str = "quarterly", *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.interval = interval
|
||||
|
||||
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
dt = df["period"].apply(
|
||||
lambda x: (
|
||||
pd.to_datetime(x) + pd.DateOffset(days=(45 if self.interval == PitCollector.INTERVAL_QUARTERLY else 90))
|
||||
).date()
|
||||
)
|
||||
df["date"] = df["date"].fillna(dt.astype(str))
|
||||
|
||||
df["period"] = pd.to_datetime(df["period"])
|
||||
df["period"] = df["period"].apply(
|
||||
lambda x: x.year if self.interval == PitCollector.INTERVAL_ANNUAL else x.year * 100 + (x.month - 1) // 3 + 1
|
||||
)
|
||||
return df
|
||||
|
||||
def _get_calendar_list(self) -> Iterable[pd.Timestamp]:
|
||||
return get_calendar_list()
|
||||
|
||||
|
||||
class Run(BaseRun):
|
||||
@property
|
||||
def collector_class_name(self) -> str:
|
||||
return f"PitCollector"
|
||||
|
||||
@property
|
||||
def normalize_class_name(self) -> str:
|
||||
return f"PitNormalize"
|
||||
|
||||
@property
|
||||
def default_base_dir(self) -> Union[Path, str]:
|
||||
return BASE_DIR
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
bs.login()
|
||||
fire.Fire(Run)
|
||||
bs.logout()
|
@ -0,0 +1,10 @@
|
||||
loguru
|
||||
fire
|
||||
tqdm
|
||||
requests
|
||||
pandas
|
||||
lxml
|
||||
baostock
|
||||
yahooquery
|
||||
beautifulsoup4
|
@ -0,0 +1,22 @@
|
||||
# NASDAQ100/SP500/SP400/DJIA History Companies Collection
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Collector Data
|
||||
|
||||
```bash
|
||||
# parse instruments, used in qlib/instruments.
|
||||
python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
|
||||
|
||||
# parse new companies
|
||||
python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method save_new_companies
|
||||
|
||||
# index_name support: SP500, NASDAQ100, DJIA, SP400
|
||||
# help
|
||||
python collector.py --help
|
||||
```
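The same operations can also be invoked from Python via `data_collector.utils.get_instruments`, which is what `fire` dispatches to in this collector; a minimal sketch, assuming it is run from the `scripts` directory so that `data_collector` is importable:

```python
from data_collector.utils import get_instruments

# equivalent to: python collector.py --index_name SP500 --qlib_dir ~/.qlib/qlib_data/us_data --method parse_instruments
get_instruments(
    qlib_dir="~/.qlib/qlib_data/us_data",
    index_name="SP500",
    method="parse_instruments",
    market_index="us_index",
)
```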
|
||||
|
@ -0,0 +1,275 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import abc
|
||||
from functools import partial
|
||||
import sys
|
||||
import importlib
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from typing import List, Union
|
||||
|
||||
import fire
|
||||
import requests
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
|
||||
|
||||
CUR_DIR = Path(__file__).resolve().parent
|
||||
sys.path.append(str(CUR_DIR.parent.parent))
|
||||
|
||||
from data_collector.index import IndexBase
|
||||
from data_collector.utils import deco_retry, get_calendar_list, get_trading_date_by_shift
|
||||
from data_collector.utils import get_instruments
|
||||
|
||||
|
||||
WIKI_URL = "https://en.wikipedia.org/wiki"
|
||||
|
||||
WIKI_INDEX_NAME_MAP = {
|
||||
"NASDAQ100": "NASDAQ-100",
|
||||
"SP500": "List_of_S%26P_500_companies",
|
||||
"SP400": "List_of_S%26P_400_companies",
|
||||
"DJIA": "Dow_Jones_Industrial_Average",
|
||||
}
|
||||
|
||||
|
||||
class WIKIIndex(IndexBase):
|
||||
# NOTE: The US stock code contains "PRN", and the directory cannot be created on Windows system, use the "_" prefix
|
||||
# https://superuser.com/questions/613313/why-cant-we-make-con-prn-null-folder-in-windows
|
||||
INST_PREFIX = ""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
index_name: str,
|
||||
qlib_dir: Union[str, Path] = None,
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
):
|
||||
super(WIKIIndex, self).__init__(
|
||||
index_name=index_name, qlib_dir=qlib_dir, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
|
||||
self._target_url = f"{WIKI_URL}/{WIKI_INDEX_NAME_MAP[self.index_name.upper()]}"
|
||||
|
||||
@property
|
||||
@abc.abstractmethod
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
"""
|
||||
Returns
|
||||
-------
|
||||
index start date
|
||||
"""
|
||||
raise NotImplementedError("rewrite bench_start_date")
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
"""get companies changes
|
||||
|
||||
Returns
|
||||
-------
|
||||
pd.DataFrame:
|
||||
symbol date type
|
||||
SH600000 2019-11-11 add
|
||||
SH600000 2020-11-10 remove
|
||||
dtypes:
|
||||
symbol: str
|
||||
date: pd.Timestamp
|
||||
type: str, value from ["add", "remove"]
|
||||
"""
|
||||
raise NotImplementedError("rewrite get_changes")
|
||||
|
||||
def format_datetime(self, inst_df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""formatting the datetime in an instrument
|
||||
|
||||
Parameters
|
||||
----------
|
||||
inst_df: pd.DataFrame
|
||||
inst_df.columns = [self.SYMBOL_FIELD_NAME, self.START_DATE_FIELD, self.END_DATE_FIELD]
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
if self.freq != "day":
|
||||
inst_df[self.END_DATE_FIELD] = inst_df[self.END_DATE_FIELD].apply(
|
||||
lambda x: (pd.Timestamp(x) + pd.Timedelta(hours=23, minutes=59)).strftime("%Y-%m-%d %H:%M:%S")
|
||||
)
|
||||
return inst_df
|
||||
|
||||
@property
|
||||
def calendar_list(self) -> List[pd.Timestamp]:
|
||||
"""get history trading date
|
||||
|
||||
Returns
|
||||
-------
|
||||
calendar list
|
||||
"""
|
||||
_calendar_list = getattr(self, "_calendar_list", None)
|
||||
if _calendar_list is None:
|
||||
_calendar_list = list(filter(lambda x: x >= self.bench_start_date, get_calendar_list("US_ALL")))
|
||||
setattr(self, "_calendar_list", _calendar_list)
|
||||
return _calendar_list
|
||||
|
||||
def _request_new_companies(self) -> requests.Response:
|
||||
resp = requests.get(self._target_url)
|
||||
if resp.status_code != 200:
|
||||
raise ValueError(f"request error: {self._target_url}")
|
||||
|
||||
return resp
|
||||
|
||||
def set_default_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
_df = df.copy()
|
||||
_df[self.SYMBOL_FIELD_NAME] = _df[self.SYMBOL_FIELD_NAME].str.strip()
|
||||
_df[self.START_DATE_FIELD] = self.bench_start_date
|
||||
_df[self.END_DATE_FIELD] = self.DEFAULT_END_DATE
|
||||
return _df.loc[:, self.INSTRUMENTS_COLUMNS]
|
||||
|
||||
def get_new_companies(self):
|
||||
logger.info(f"get new companies {self.index_name} ......")
|
||||
_data = deco_retry(retry=self._request_retry, retry_sleep=self._retry_sleep)(self._request_new_companies)()
|
||||
df_list = pd.read_html(_data.text)
|
||||
for _df in df_list:
|
||||
_df = self.filter_df(_df)
|
||||
if (_df is not None) and (not _df.empty):
|
||||
_df.columns = [self.SYMBOL_FIELD_NAME]
|
||||
_df = self.set_default_date_range(_df)
|
||||
logger.info(f"end of get new companies {self.index_name} ......")
|
||||
return _df
|
||||
|
||||
def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
raise NotImplementedError("rewrite filter_df")
|
||||
|
||||
|
||||
class NASDAQ100Index(WIKIIndex):
|
||||
|
||||
HISTORY_COMPANIES_URL = (
|
||||
"https://indexes.nasdaqomx.com/Index/WeightingData?id=NDX&tradeDate={trade_date}T00%3A00%3A00.000&timeOfDay=SOD"
|
||||
)
|
||||
MAX_WORKERS = 16
|
||||
|
||||
def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
if len(df) >= 100 and "Ticker" in df.columns:
|
||||
return df.loc[:, ["Ticker"]].copy()
|
||||
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("2003-01-02")
|
||||
|
||||
@deco_retry
|
||||
def _request_history_companies(self, trade_date: pd.Timestamp, use_cache: bool = True) -> pd.DataFrame:
|
||||
trade_date = trade_date.strftime("%Y-%m-%d")
|
||||
cache_path = self.cache_dir.joinpath(f"{trade_date}_history_companies.pkl")
|
||||
if cache_path.exists() and use_cache:
|
||||
df = pd.read_pickle(cache_path)
|
||||
else:
|
||||
url = self.HISTORY_COMPANIES_URL.format(trade_date=trade_date)
|
||||
resp = requests.post(url)
|
||||
if resp.status_code != 200:
|
||||
raise ValueError(f"request error: {url}")
|
||||
df = pd.DataFrame(resp.json()["aaData"])
|
||||
df[self.DATE_FIELD_NAME] = trade_date
|
||||
df.rename(columns={"Name": "name", "Symbol": self.SYMBOL_FIELD_NAME}, inplace=True)
|
||||
if not df.empty:
|
||||
df.to_pickle(cache_path)
|
||||
return df
|
||||
|
||||
def get_history_companies(self):
|
||||
logger.info(f"start get history companies......")
|
||||
all_history = []
|
||||
error_list = []
|
||||
with tqdm(total=len(self.calendar_list)) as p_bar:
|
||||
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
|
||||
for _trading_date, _df in zip(
|
||||
self.calendar_list, executor.map(self._request_history_companies, self.calendar_list)
|
||||
):
|
||||
if _df.empty:
|
||||
error_list.append(_trading_date)
|
||||
else:
|
||||
all_history.append(_df)
|
||||
p_bar.update()
|
||||
|
||||
if error_list:
|
||||
logger.warning(f"get error: {error_list}")
|
||||
logger.info(f"total {len(self.calendar_list)}, error {len(error_list)}")
|
||||
logger.info(f"end of get history companies.")
|
||||
return pd.concat(all_history, sort=False)
|
||||
|
||||
def get_changes(self):
|
||||
return self.get_changes_with_history_companies(self.get_history_companies())
|
||||
|
||||
|
||||
class DJIAIndex(WIKIIndex):
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("2000-01-01")
|
||||
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
pass
|
||||
|
||||
def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
if "Symbol" in df.columns:
|
||||
_df = df.loc[:, ["Symbol"]].copy()
|
||||
_df["Symbol"] = _df["Symbol"].apply(lambda x: x.split(":")[-1])
|
||||
return _df
|
||||
|
||||
def parse_instruments(self):
|
||||
logger.warning(f"No suitable data source has been found!")
|
||||
|
||||
|
||||
class SP500Index(WIKIIndex):
|
||||
WIKISP500_CHANGES_URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
|
||||
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("1999-01-01")
|
||||
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
logger.info(f"get sp500 history changes......")
|
||||
# NOTE: may update the index of the table
|
||||
changes_df = pd.read_html(self.WIKISP500_CHANGES_URL)[-1]
|
||||
changes_df = changes_df.iloc[:, [0, 1, 3]]
|
||||
changes_df.columns = [self.DATE_FIELD_NAME, self.ADD, self.REMOVE]
|
||||
changes_df[self.DATE_FIELD_NAME] = pd.to_datetime(changes_df[self.DATE_FIELD_NAME])
|
||||
_result = []
|
||||
for _type in [self.ADD, self.REMOVE]:
|
||||
_df = changes_df.copy()
|
||||
_df[self.CHANGE_TYPE_FIELD] = _type
|
||||
_df[self.SYMBOL_FIELD_NAME] = _df[_type]
|
||||
_df.dropna(subset=[self.SYMBOL_FIELD_NAME], inplace=True)
|
||||
if _type == self.ADD:
|
||||
_df[self.DATE_FIELD_NAME] = _df[self.DATE_FIELD_NAME].apply(
|
||||
lambda x: get_trading_date_by_shift(self.calendar_list, x, 0)
|
||||
)
|
||||
else:
|
||||
_df[self.DATE_FIELD_NAME] = _df[self.DATE_FIELD_NAME].apply(
|
||||
lambda x: get_trading_date_by_shift(self.calendar_list, x, -1)
|
||||
)
|
||||
_result.append(_df[[self.DATE_FIELD_NAME, self.CHANGE_TYPE_FIELD, self.SYMBOL_FIELD_NAME]])
|
||||
logger.info(f"end of get sp500 history changes.")
|
||||
return pd.concat(_result, sort=False)
|
||||
|
||||
def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
if "Symbol" in df.columns:
|
||||
return df.loc[:, ["Symbol"]].copy()
|
||||
|
||||
|
||||
class SP400Index(WIKIIndex):
|
||||
@property
|
||||
def bench_start_date(self) -> pd.Timestamp:
|
||||
return pd.Timestamp("2000-01-01")
|
||||
|
||||
def get_changes(self) -> pd.DataFrame:
|
||||
pass
|
||||
|
||||
def filter_df(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
if "Ticker symbol" in df.columns:
|
||||
return df.loc[:, ["Ticker symbol"]].copy()
|
||||
|
||||
def parse_instruments(self):
|
||||
logger.warning(f"No suitable data source has been found!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(partial(get_instruments, market_index="us_index"))
|
@ -0,0 +1,6 @@
|
||||
loguru
|
||||
fire
|
||||
requests
|
||||
pandas
|
||||
lxml
|
@ -0,0 +1,609 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
import re
|
||||
import importlib
|
||||
import time
|
||||
import bisect
|
||||
import pickle
|
||||
import random
|
||||
import requests
|
||||
import functools
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Tuple, List, Union
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from lxml import etree
|
||||
from loguru import logger
|
||||
from yahooquery import Ticker
|
||||
from tqdm import tqdm
|
||||
from functools import partial
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
|
||||
|
||||
CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
|
||||
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"
|
||||
|
||||
CALENDAR_BENCH_URL_MAP = {
|
||||
"CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
|
||||
"CSI500": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
|
||||
"CSI100": CALENDAR_URL_BASE.format(market=1, bench_code="000903"),
|
||||
# NOTE: Use the time series of SH600000 as the sequence of all stocks
|
||||
"ALL": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
|
||||
# NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks
|
||||
"US_ALL": "^GSPC",
|
||||
"IN_ALL": "^NSEI",
|
||||
"BR_ALL": "^BVSP",
|
||||
}
|
||||
|
||||
_BENCH_CALENDAR_LIST = None
|
||||
_ALL_CALENDAR_LIST = None
|
||||
_HS_SYMBOLS = None
|
||||
_US_SYMBOLS = None
|
||||
_IN_SYMBOLS = None
|
||||
_BR_SYMBOLS = None
|
||||
_EN_FUND_SYMBOLS = None
|
||||
_CALENDAR_MAP = {}
|
||||
|
||||
# NOTE: Until 2020-10-20 20:00:00
|
||||
MINIMUM_SYMBOLS_NUM = 3900
|
||||
|
||||
|
||||
def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
|
||||
"""get SH/SZ history calendar list
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bench_code: str
|
||||
value from ["CSI300", "CSI500", "ALL", "US_ALL"]
|
||||
|
||||
Returns
|
||||
-------
|
||||
history calendar list
|
||||
"""
|
||||
|
||||
logger.info(f"get calendar list: {bench_code}......")
|
||||
|
||||
def _get_calendar(url):
|
||||
_value_list = requests.get(url).json()["data"]["klines"]
|
||||
return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list))
|
||||
|
||||
calendar = _CALENDAR_MAP.get(bench_code, None)
|
||||
if calendar is None:
|
||||
if bench_code.startswith("US_") or bench_code.startswith("IN_") or bench_code.startswith("BR_"):
|
||||
|
||||
df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
|
||||
calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
|
||||
else:
|
||||
if bench_code.upper() == "ALL":
|
||||
|
||||
@deco_retry
|
||||
def _get_calendar(month):
|
||||
_cal = []
|
||||
try:
|
||||
resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random())).json()
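# in the SZSE response, "jyrq" carries the calendar date and "jybz" appears to be the flag marking whether that date is a trading day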
|
||||
for _r in resp["data"]:
|
||||
if int(_r["jybz"]):
|
||||
_cal.append(pd.Timestamp(_r["jyrq"]))
|
||||
except Exception as e:
|
||||
raise ValueError(f"{month}-->{e}")
|
||||
return _cal
|
||||
|
||||
month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
|
||||
calendar = []
|
||||
for _m in month_range:
|
||||
cal = _get_calendar(_m.strftime("%Y-%m"))
|
||||
if cal:
|
||||
calendar += cal
|
||||
calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
|
||||
else:
|
||||
calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
|
||||
_CALENDAR_MAP[bench_code] = calendar
|
||||
logger.info(f"end of get calendar list: {bench_code}.")
|
||||
return calendar
|
||||
|
||||
|
||||
def return_date_list(date_field_name: str, file_path: Path):
|
||||
date_list = pd.read_csv(file_path, sep=",", index_col=0)[date_field_name].to_list()
|
||||
return sorted(map(lambda x: pd.Timestamp(x), date_list))
|
||||
|
||||
|
||||
def get_calendar_list_by_ratio(
|
||||
source_dir: Union[str, Path],
|
||||
date_field_name: str = "date",
|
||||
threshold: float = 0.5,
|
||||
minimum_count: int = 10,
|
||||
max_workers: int = 16,
|
||||
) -> list:
|
||||
"""get calendar list by selecting the date when few funds trade in this day
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_dir: str or Path
|
||||
The directory where the raw data collected from the Internet is saved
|
||||
date_field_name: str
|
||||
date field name, default is date
|
||||
threshold: float
|
||||
threshold to exclude some days when few funds trade in this day, default 0.5
|
||||
minimum_count: int
|
||||
minimum count of funds should trade in one day
|
||||
max_workers: int
|
||||
Concurrent number, default is 16
|
||||
|
||||
Returns
|
||||
-------
|
||||
history calendar list
|
||||
"""
|
||||
logger.info(f"get calendar list from {source_dir} by threshold = {threshold}......")
|
||||
|
||||
source_dir = Path(source_dir).expanduser()
|
||||
file_list = list(source_dir.glob("*.csv"))
|
||||
|
||||
_number_all_funds = len(file_list)
|
||||
|
||||
logger.info(f"count how many funds trade in this day......")
|
||||
_dict_count_trade = dict() # dict{date:count}
|
||||
_fun = partial(return_date_list, date_field_name)
|
||||
all_oldest_list = []
|
||||
with tqdm(total=_number_all_funds) as p_bar:
|
||||
with ProcessPoolExecutor(max_workers=max_workers) as executor:
|
||||
for date_list in executor.map(_fun, file_list):
|
||||
if date_list:
|
||||
all_oldest_list.append(date_list[0])
|
||||
for date in date_list:
|
||||
if date not in _dict_count_trade.keys():
|
||||
_dict_count_trade[date] = 0
|
||||
|
||||
_dict_count_trade[date] += 1
|
||||
|
||||
p_bar.update()
|
||||
|
||||
logger.info(f"count how many funds have founded in this day......")
|
||||
_dict_count_founding = {date: _number_all_funds for date in _dict_count_trade.keys()} # dict{date:count}
|
||||
with tqdm(total=_number_all_funds) as p_bar:
|
||||
for oldest_date in all_oldest_list:
|
||||
for date in _dict_count_founding.keys():
|
||||
if date < oldest_date:
|
||||
_dict_count_founding[date] -= 1
|
||||
|
||||
calendar = [
|
||||
date
|
||||
for date in _dict_count_trade
|
||||
if _dict_count_trade[date] >= max(int(_dict_count_founding[date] * threshold), minimum_count)
|
||||
]
|
||||
|
||||
return calendar
|
||||
|
||||
|
||||
def get_hs_stock_symbols() -> list:
|
||||
"""get SH/SZ stock symbols
|
||||
|
||||
Returns
|
||||
-------
|
||||
stock symbols
|
||||
"""
|
||||
global _HS_SYMBOLS
|
||||
|
||||
def _get_symbol():
|
||||
_res = set()
|
||||
for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")):
|
||||
resp = requests.get(HS_SYMBOLS_URL.format(s_type=_k))
|
||||
_res |= set(
|
||||
map(
|
||||
lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v),
|
||||
etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"),
|
||||
)
|
||||
)
|
||||
time.sleep(3)
|
||||
return _res
|
||||
|
||||
if _HS_SYMBOLS is None:
|
||||
symbols = set()
|
||||
_retry = 60
|
||||
# It may take multiple attempts to get the complete symbol list
|
||||
while len(symbols) < MINIMUM_SYMBOLS_NUM:
|
||||
symbols |= _get_symbol()
|
||||
time.sleep(3)
|
||||
|
||||
symbol_cache_path = Path("~/.cache/hs_symbols_cache.pkl").expanduser().resolve()
|
||||
symbol_cache_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
if symbol_cache_path.exists():
|
||||
with symbol_cache_path.open("rb") as fp:
|
||||
cache_symbols = pickle.load(fp)
|
||||
symbols |= cache_symbols
|
||||
with symbol_cache_path.open("wb") as fp:
|
||||
pickle.dump(symbols, fp)
|
||||
|
||||
_HS_SYMBOLS = sorted(list(symbols))
|
||||
|
||||
return _HS_SYMBOLS
|
||||
|
||||
|
||||
def get_us_stock_symbols(qlib_data_path: Union[str, Path] = None) -> list:
|
||||
"""get US stock symbols
|
||||
|
||||
Returns
|
||||
-------
|
||||
stock symbols
|
||||
"""
|
||||
global _US_SYMBOLS
|
||||
|
||||
@deco_retry
|
||||
def _get_eastmoney():
|
||||
url = "http://4.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&fs=m:105,m:106,m:107&fields=f12"
|
||||
resp = requests.get(url)
|
||||
if resp.status_code != 200:
|
||||
raise ValueError("request error")
|
||||
|
||||
try:
|
||||
_symbols = [_v["f12"].replace("_", "-P") for _v in resp.json()["data"]["diff"].values()]
|
||||
except Exception as e:
|
||||
logger.warning(f"request error: {e}")
|
||||
raise
|
||||
|
||||
if len(_symbols) < 8000:
|
||||
raise ValueError("request error")
|
||||
|
||||
return _symbols
|
||||
|
||||
@deco_retry
|
||||
def _get_nasdaq():
|
||||
_res_symbols = []
|
||||
for _name in ["otherlisted", "nasdaqtraded"]:
|
||||
url = f"ftp://ftp.nasdaqtrader.com/SymbolDirectory/{_name}.txt"
|
||||
df = pd.read_csv(url, sep="|")
|
||||
df = df.rename(columns={"ACT Symbol": "Symbol"})
|
||||
_symbols = df["Symbol"].dropna()
|
||||
_symbols = _symbols.str.replace("$", "-P", regex=False)
|
||||
_symbols = _symbols.str.replace(".W", "-WT", regex=False)
|
||||
_symbols = _symbols.str.replace(".U", "-UN", regex=False)
|
||||
_symbols = _symbols.str.replace(".R", "-RI", regex=False)
|
||||
_symbols = _symbols.str.replace(".", "-", regex=False)
|
||||
_res_symbols += _symbols.unique().tolist()
|
||||
return _res_symbols
|
||||
|
||||
@deco_retry
|
||||
def _get_nyse():
|
||||
url = "https://www.nyse.com/api/quotes/filter"
|
||||
_parms = {
|
||||
"instrumentType": "EQUITY",
|
||||
"pageNumber": 1,
|
||||
"sortColumn": "NORMALIZED_TICKER",
|
||||
"sortOrder": "ASC",
|
||||
"maxResultsPerPage": 10000,
|
||||
"filterToken": "",
|
||||
}
|
||||
resp = requests.post(url, json=_parms)
|
||||
if resp.status_code != 200:
|
||||
raise ValueError("request error")
|
||||
|
||||
try:
|
||||
_symbols = [_v["symbolTicker"].replace("-", "-P") for _v in resp.json()]
|
||||
except Exception as e:
|
||||
logger.warning(f"request error: {e}")
|
||||
_symbols = []
|
||||
return _symbols
|
||||
|
||||
if _US_SYMBOLS is None:
|
||||
_all_symbols = _get_eastmoney() + _get_nasdaq() + _get_nyse()
|
||||
if qlib_data_path is not None:
|
||||
for _index in ["nasdaq100", "sp500"]:
|
||||
ins_df = pd.read_csv(
|
||||
Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
|
||||
sep="\t",
|
||||
names=["symbol", "start_date", "end_date"],
|
||||
)
|
||||
_all_symbols += ins_df["symbol"].unique().tolist()
|
||||
|
||||
def _format(s_):
|
||||
s_ = s_.replace(".", "-")
|
||||
s_ = s_.strip("$")
|
||||
s_ = s_.strip("*")
|
||||
return s_
|
||||
|
||||
_US_SYMBOLS = sorted(set(map(_format, filter(lambda x: len(x) < 8 and not x.endswith("WS"), _all_symbols))))
|
||||
|
||||
return _US_SYMBOLS
|
||||
|
||||
|
||||
def get_in_stock_symbols(qlib_data_path: Union[str, Path] = None) -> list:
|
||||
"""get IN stock symbols
|
||||
|
||||
Returns
|
||||
-------
|
||||
stock symbols
|
||||
"""
|
||||
global _IN_SYMBOLS
|
||||
|
||||
@deco_retry
|
||||
def _get_nifty():
|
||||
url = f"https://www1.nseindia.com/content/equities/EQUITY_L.csv"
|
||||
df = pd.read_csv(url)
|
||||
df = df.rename(columns={"SYMBOL": "Symbol"})
|
||||
df["Symbol"] = df["Symbol"] + ".NS"
|
||||
_symbols = df["Symbol"].dropna()
|
||||
_symbols = _symbols.unique().tolist()
|
||||
return _symbols
|
||||
|
||||
if _IN_SYMBOLS is None:
|
||||
_all_symbols = _get_nifty()
|
||||
if qlib_data_path is not None:
|
||||
for _index in ["nifty"]:
|
||||
ins_df = pd.read_csv(
|
||||
Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
|
||||
sep="\t",
|
||||
names=["symbol", "start_date", "end_date"],
|
||||
)
|
||||
_all_symbols += ins_df["symbol"].unique().tolist()
|
||||
|
||||
def _format(s_):
|
||||
s_ = s_.replace(".", "-")
|
||||
s_ = s_.strip("$")
|
||||
s_ = s_.strip("*")
|
||||
return s_
|
||||
|
||||
_IN_SYMBOLS = sorted(set(_all_symbols))
|
||||
|
||||
return _IN_SYMBOLS
|
||||
|
||||
|
||||
def get_br_stock_symbols(qlib_data_path: Union[str, Path] = None) -> list:
|
||||
"""get Brazil(B3) stock symbols
|
||||
|
||||
Returns
|
||||
-------
|
||||
B3 stock symbols
|
||||
"""
|
||||
global _BR_SYMBOLS
|
||||
|
||||
@deco_retry
|
||||
def _get_ibovespa():
|
||||
_symbols = []
|
||||
url = "https://www.fundamentus.com.br/detalhes.php?papel="
|
||||
|
||||
# Request
|
||||
agent = {"User-Agent": "Mozilla/5.0"}
|
||||
page = requests.get(url, headers=agent)
|
||||
|
||||
# BeautifulSoup
|
||||
soup = BeautifulSoup(page.content, "html.parser")
|
||||
tbody = soup.find("tbody")
|
||||
|
||||
children = tbody.findChildren("a", recursive=True)
|
||||
for child in children:
|
||||
_symbols.append(str(child).split('"')[-1].split(">")[1].split("<")[0])
|
||||
|
||||
return _symbols
|
||||
|
||||
if _BR_SYMBOLS is None:
|
||||
_all_symbols = _get_ibovespa()
|
||||
if qlib_data_path is not None:
|
||||
for _index in ["ibov"]:
|
||||
ins_df = pd.read_csv(
|
||||
Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
|
||||
sep="\t",
|
||||
names=["symbol", "start_date", "end_date"],
|
||||
)
|
||||
_all_symbols += ins_df["symbol"].unique().tolist()
|
||||
|
||||
def _format(s_):
|
||||
s_ = s_.strip()
|
||||
s_ = s_.strip("$")
|
||||
s_ = s_.strip("*")
|
||||
s_ = s_ + ".SA"
|
||||
return s_
|
||||
|
||||
_BR_SYMBOLS = sorted(set(map(_format, _all_symbols)))
|
||||
|
||||
return _BR_SYMBOLS
|
||||
|
||||
|
||||
def get_en_fund_symbols(qlib_data_path: Union[str, Path] = None) -> list:
|
||||
"""get en fund symbols
|
||||
|
||||
Returns
|
||||
-------
|
||||
fund symbols in China
|
||||
"""
|
||||
global _EN_FUND_SYMBOLS
|
||||
|
||||
@deco_retry
|
||||
def _get_eastmoney():
|
||||
url = "http://fund.eastmoney.com/js/fundcode_search.js"
|
||||
resp = requests.get(url)
|
||||
if resp.status_code != 200:
|
||||
raise ValueError("request error")
|
||||
try:
|
||||
_symbols = []
|
||||
for sub_data in re.findall(r"[\[](.*?)[\]]", resp.content.decode().split("= [")[-1].replace("];", "")):
|
||||
data = sub_data.replace('"', "").replace("'", "")
|
||||
# TODO: do we need other information, like fund_name from ['000001', 'HXCZHH', '华夏成长混合', '混合型', 'HUAXIACHENGZHANGHUNHE']
|
||||
_symbols.append(data.split(",")[0])
|
||||
except Exception as e:
|
||||
logger.warning(f"request error: {e}")
|
||||
raise
|
||||
if len(_symbols) < 8000:
|
||||
raise ValueError("request error")
|
||||
return _symbols
|
||||
|
||||
if _EN_FUND_SYMBOLS is None:
|
||||
_all_symbols = _get_eastmoney()
|
||||
|
||||
_EN_FUND_SYMBOLS = sorted(set(_all_symbols))
|
||||
|
||||
return _EN_FUND_SYMBOLS
|
||||
|
||||
|
||||
def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str:
|
||||
"""symbol suffix to prefix
|
||||
|
||||
Parameters
|
||||
----------
|
||||
symbol: str
|
||||
symbol
|
||||
capital : bool
|
||||
by default True
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
code, exchange = symbol.split(".")
|
||||
if exchange.lower() in ["sh", "ss"]:
|
||||
res = f"sh{code}"
|
||||
else:
|
||||
res = f"{exchange}{code}"
|
||||
return res.upper() if capital else res.lower()
|
||||
|
||||
|
||||
def symbol_prefix_to_sufix(symbol: str, capital: bool = True) -> str:
|
||||
"""symbol prefix to sufix
|
||||
|
||||
Parameters
|
||||
----------
|
||||
symbol: str
|
||||
symbol
|
||||
capital : bool
|
||||
by default True
|
||||
Returns
|
||||
-------
|
||||
|
||||
"""
|
||||
res = f"{symbol[:-2]}.{symbol[-2:]}"
|
||||
return res.upper() if capital else res.lower()
|
||||
|
||||
|
||||
def deco_retry(retry: int = 5, retry_sleep: int = 3):
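# deco_retry can be used either bare (@deco_retry, which falls back to 5 retries) or parametrized
# (@deco_retry(retry=10, retry_sleep=1)); the callable(retry) checks below handle both forms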
|
||||
def deco_func(func):
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
_retry = 5 if callable(retry) else retry
|
||||
_result = None
|
||||
for _i in range(1, _retry + 1):
|
||||
try:
|
||||
_result = func(*args, **kwargs)
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"{func.__name__}: {_i} :{e}")
|
||||
if _i == _retry:
|
||||
raise
|
||||
|
||||
time.sleep(retry_sleep)
|
||||
return _result
|
||||
|
||||
return wrapper
|
||||
|
||||
return deco_func(retry) if callable(retry) else deco_func
|
||||
|
||||
|
||||
def get_trading_date_by_shift(trading_list: list, trading_date: pd.Timestamp, shift: int = 1):
|
||||
"""get trading date by shift
|
||||
|
||||
Parameters
|
||||
----------
|
||||
trading_list: list
|
||||
trading calendar list
|
||||
shift : int
|
||||
shift, default is 1
|
||||
|
||||
trading_date : pd.Timestamp
|
||||
trading date
|
||||
Returns
|
||||
-------
the trading date in `trading_list` shifted by `shift` positions from `trading_date`; if the shifted index is out of range, `trading_date` itself is returned
|
||||
|
||||
"""
|
||||
trading_date = pd.Timestamp(trading_date)
|
||||
left_index = bisect.bisect_left(trading_list, trading_date)
|
||||
try:
|
||||
res = trading_list[left_index + shift]
|
||||
except IndexError:
|
||||
res = trading_date
|
||||
return res
|
||||
|
||||
|
||||
def generate_minutes_calendar_from_daily(
|
||||
calendars: Iterable,
|
||||
freq: str = "1min",
|
||||
am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
|
||||
pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
|
||||
) -> pd.Index:
|
||||
"""generate minutes calendar
|
||||
|
||||
Parameters
|
||||
----------
|
||||
calendars: Iterable
|
||||
daily calendar
|
||||
freq: str
|
||||
by default 1min
|
||||
am_range: Tuple[str, str]
|
||||
AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
|
||||
pm_range: Tuple[str, str]
|
||||
PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")
|
||||
|
||||
"""
|
||||
daily_format: str = "%Y-%m-%d"
|
||||
res = []
|
||||
for _day in calendars:
|
||||
for _range in [am_range, pm_range]:
|
||||
res.append(
|
||||
pd.date_range(
|
||||
f"{pd.Timestamp(_day).strftime(daily_format)} {_range[0]}",
|
||||
f"{pd.Timestamp(_day).strftime(daily_format)} {_range[1]}",
|
||||
freq=freq,
|
||||
)
|
||||
)
|
||||
|
||||
return pd.Index(sorted(set(np.hstack(res))))
|
||||
|
||||
|
||||
def get_instruments(
|
||||
qlib_dir: str,
|
||||
index_name: str,
|
||||
method: str = "parse_instruments",
|
||||
freq: str = "day",
|
||||
request_retry: int = 5,
|
||||
retry_sleep: int = 3,
|
||||
market_index: str = "cn_index",
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
qlib_dir: str
|
||||
qlib data dir, default "Path(__file__).parent/qlib_data"
|
||||
index_name: str
|
||||
index name, value from ["csi100", "csi300"]
|
||||
method: str
|
||||
method, value from ["parse_instruments", "save_new_companies"]
|
||||
freq: str
|
||||
freq, value from ["day", "1min"]
|
||||
request_retry: int
|
||||
request retry, by default 5
|
||||
retry_sleep: int
|
||||
request sleep, by default 3
|
||||
market_index: str
|
||||
Where the files to obtain the index are located,
|
||||
for example data_collector.cn_index.collector
|
||||
|
||||
Examples
|
||||
-------
|
||||
# parse instruments
|
||||
$ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments
|
||||
|
||||
# parse new companies
|
||||
$ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
|
||||
|
||||
"""
|
||||
_cur_module = importlib.import_module("data_collector.{}.collector".format(market_index))
|
||||
obj = getattr(_cur_module, f"{index_name.upper()}Index")(
|
||||
qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
|
||||
)
|
||||
getattr(obj, method)()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM
|
@ -0,0 +1,220 @@
|
||||
|
||||
- [Collector Data](#collector-data)
|
||||
- [Get Qlib data](#get-qlib-databin-file)
|
||||
- [Collector *YahooFinance* data to qlib](#collector-yahoofinance-data-to-qlib)
|
||||
- [Automatic update of daily frequency data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
|
||||
- [Using qlib data](#using-qlib-data)
|
||||
|
||||
|
||||
# Collect Data From Yahoo Finance
|
||||
|
||||
> *Please pay **ATTENTION** that the data is collected from [Yahoo Finance](https://finance.yahoo.com/lookup) and the data might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
|
||||
|
||||
**NOTE**: Yahoo! Finance has blocked access from mainland China. Please change your network if you want to use the Yahoo data crawler.
|
||||
|
||||
> **Examples of abnormal data**
|
||||
|
||||
- [SH000661](https://finance.yahoo.com/quote/000661.SZ/history?period1=1558310400&period2=1590796800&interval=1d&filter=history&frequency=1d)
|
||||
- [SZ300144](https://finance.yahoo.com/quote/300144.SZ/history?period1=1557446400&period2=1589932800&interval=1d&filter=history&frequency=1d)
|
||||
|
||||
We have considered **STOCK PRICE ADJUSTMENT**, but some price series still seem very abnormal.
|
||||
|
||||
## Requirements
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## Collector Data
|
||||
|
||||
### Get Qlib data(`bin file`)
|
||||
> `qlib-data` from *YahooFinance* is data that has already been dumped and can be used directly in `qlib`.
|
||||
> This ready-made qlib-data is not updated regularly. If users want the latest data, please follow [these steps](#collector-yahoofinance-data-to-qlib) to download the latest data.
|
||||
|
||||
- get data: `python scripts/get_data.py qlib_data`
|
||||
- parameters:
|
||||
- `target_dir`: save dir, by default *~/.qlib/qlib_data/cn_data*
|
||||
- `version`: dataset version, value from [`v1`, `v2`], by default `v1`
|
||||
- `v2` end date is *2021-06*, `v1` end date is *2020-09*
|
||||
- user can append data to `v2`: [automatic update of daily frequency data](#automatic-update-of-daily-frequency-datafrom-yahoo-finance)
|
||||
- **the [benchmarks](https://github.com/microsoft/qlib/tree/main/examples/benchmarks) for qlib use `v1`**, *due to the unstable access to historical data by YahooFinance, there are some differences between `v2` and `v1`*
|
||||
- `interval`: `1d` or `1min`, by default `1d`
|
||||
- `region`: `cn` or `us` or `in`, by default `cn`
|
||||
- `delete_old`: delete existing data from `target_dir`(*features, calendars, instruments, dataset_cache, features_cache*), value from [`True`, `False`], by default `True`
|
||||
- `exists_skip`: if data already exists in `target_dir`, skip `get_data`, value from [`True`, `False`], by default `False`
|
||||
- examples:
|
||||
```bash
|
||||
# cn 1d
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
|
||||
# cn 1min
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --region cn --interval 1min
|
||||
# us 1d
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/us_data --region us --interval 1d
|
||||
# us 1min
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/us_data_1min --region us --interval 1min
|
||||
# in 1d
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/in_data --region in --interval 1d
|
||||
# in 1min
|
||||
python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/in_data_1min --region in --interval 1min
|
||||
```
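If you prefer Python over the CLI, `scripts/get_data.py` is a thin `fire` wrapper around `qlib.tests.data.GetData`; a minimal sketch, assuming the keyword arguments map one-to-one onto the CLI options above:

```python
from qlib.tests.data import GetData

# equivalent to: python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
GetData().qlib_data(target_dir="~/.qlib/qlib_data/cn_data", region="cn", interval="1d")
```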
|
||||
|
||||
### Collector *YahooFinance* data to qlib
|
||||
> collect *YahooFinance* data and *dump* it into `qlib` format.
|
||||
> If the above ready-made data can't meet users' requirements, users can follow this section to crawl the latest data and convert it to qlib-data.
|
||||
1. download data to csv: `python scripts/data_collector/yahoo/collector.py download_data`
|
||||
|
||||
- parameters:
|
||||
- `source_dir`: directory where the downloaded data is saved
|
||||
- `interval`: `1d` or `1min`, by default `1d`
|
||||
> **due to the limitation of the *YahooFinance API*, only the last month's data is available in `1min`**
|
||||
- `region`: `CN` or `US` or `IN` or `BR`, by default `CN`
|
||||
- `delay`: `time.sleep(delay)`, by default *0.5*
|
||||
- `start`: start datetime, by default *"2000-01-01"*; *closed interval(including start)*
|
||||
- `end`: end datetime, by default `pd.Timestamp(datetime.datetime.now() + pd.Timedelta(days=1))`; *open interval(excluding end)*
|
||||
- `max_workers`: number of symbols fetched concurrently; to keep the symbol data intact it is not recommended to change this parameter, by default *1*
|
||||
- `check_data_length`: check the number of rows per *symbol*, by default `None`
|
||||
> if `len(symbol_df) < check_data_length`, it will be re-fetched, with the number of re-fetches coming from the `max_collector_count` parameter
|
||||
- `max_collector_count`: number of *"failed"* symbol retries, by default 2
|
||||
- examples:
|
||||
```bash
|
||||
# cn 1d data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region CN
|
||||
# cn 1min data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --delay 1 --interval 1min --region CN
|
||||
|
||||
# us 1d data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region US
|
||||
# us 1min data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/us_data_1min --delay 1 --interval 1min --region US
|
||||
|
||||
# in 1d data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data --start 2020-01-01 --end 2020-12-31 --delay 1 --interval 1d --region IN
|
||||
# in 1min data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/in_data_1min --delay 1 --interval 1min --region IN
|
||||
|
||||
# br 1d data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data --start 2003-01-03 --end 2022-03-01 --delay 1 --interval 1d --region BR
|
||||
# br 1min data
|
||||
python collector.py download_data --source_dir ~/.qlib/stock_data/source/br_data_1min --delay 1 --interval 1min --region BR
|
||||
```
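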
|
||||
2. normalize data: `python scripts/data_collector/yahoo/collector.py normalize_data`
|
||||
|
||||
- parameters:
|
||||
- `source_dir`: csv directory
|
||||
- `normalize_dir`: result directory
|
||||
- `max_workers`: number of concurrent workers, by default *1*
|
||||
- `interval`: `1d` or `1min`, by default `1d`
|
||||
> if **`interval == 1min`**, `qlib_data_1d_dir` cannot be `None`
|
||||
- `region`: `CN` or `US` or `IN`, by default `CN`
|
||||
- `date_field_name`: column *name* identifying time in csv files, by default `date`
|
||||
- `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
|
||||
- `end_date`: if not `None`, data is normalized up to and including this date; if `None`, this parameter is ignored; by default `None`
|
||||
- `qlib_data_1d_dir`: qlib directory(1d data)
|
||||
```
|
||||
if interval==1min, qlib_data_1d_dir cannot be None, because normalizing 1min data requires the corresponding 1d data;
|
||||
|
||||
qlib_data_1d can be obtained like this:
|
||||
$ python scripts/get_data.py qlib_data --target_dir <qlib_data_1d_dir> --interval 1d
|
||||
$ python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <qlib_data_1d_dir> --trading_date 2021-06-01
|
||||
or:
|
||||
download 1d data from YahooFinance
|
||||
|
||||
```
|
||||
- examples:
|
||||
```bash
|
||||
# normalize 1d cn
|
||||
python collector.py normalize_data --source_dir ~/.qlib/stock_data/source/cn_data --normalize_dir ~/.qlib/stock_data/source/cn_1d_nor --region CN --interval 1d
|
||||
|
||||
# normalize 1min cn
|
||||
python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/cn_data --source_dir ~/.qlib/stock_data/source/cn_data_1min --normalize_dir ~/.qlib/stock_data/source/cn_1min_nor --region CN --interval 1min
|
||||
|
||||
# normalize 1d br
|
||||
python scripts/data_collector/yahoo/collector.py normalize_data --source_dir ~/.qlib/stock_data/source/br_data --normalize_dir ~/.qlib/stock_data/source/br_1d_nor --region BR --interval 1d
|
||||
|
||||
# normalize 1min br
|
||||
python collector.py normalize_data --qlib_data_1d_dir ~/.qlib/qlib_data/br_data --source_dir ~/.qlib/stock_data/source/br_data_1min --normalize_dir ~/.qlib/stock_data/source/br_1min_nor --region BR --interval 1min
|
||||
```
|
||||
3. dump data: `python scripts/dump_bin.py dump_all`

- parameters:
- `csv_path`: stock data path or directory, **the normalize result (`normalize_dir`)**
- `qlib_dir`: qlib (dump) data directory
- `freq`: transaction frequency, by default `day`
> `freq_map = {1d: day, 1min: 1min}`
- `max_workers`: number of threads, by default *16*
- `include_fields`: dump fields, by default `""`
- `exclude_fields`: fields not dumped, by default `""`
> dump_fields = `include_fields if include_fields else set(symbol_df.columns) - set(exclude_fields) if exclude_fields else symbol_df.columns` (see the sketch after the examples below)
- `symbol_field_name`: column *name* identifying symbol in csv files, by default `symbol`
- `date_field_name`: column *name* identifying time in csv files, by default `date`

- examples:

```bash
# dump 1d cn
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1d_nor --qlib_dir ~/.qlib/qlib_data/cn_data --freq day --exclude_fields date,symbol

# dump 1min cn
python dump_bin.py dump_all --csv_path ~/.qlib/stock_data/source/cn_1min_nor --qlib_dir ~/.qlib/qlib_data/cn_data_1min --freq 1min --exclude_fields date,symbol
```

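For clarity, here is a small sketch of the field-selection rule quoted above. The column set is hypothetical and the helper is only an illustration of how `include_fields`/`exclude_fields` interact, not the actual `dump_bin.py` code:

```python
# hypothetical set of columns found in one symbol's csv
symbol_df_columns = {"date", "symbol", "open", "high", "low", "close", "volume", "factor"}

def dump_fields(include_fields=(), exclude_fields=()):
    # include_fields wins outright; otherwise exclude_fields filters; otherwise dump everything
    if include_fields:
        return set(include_fields)
    if exclude_fields:
        return symbol_df_columns - set(exclude_fields)
    return symbol_df_columns

print(dump_fields(exclude_fields=("date", "symbol")))
# -> {'open', 'high', 'low', 'close', 'volume', 'factor'} (set order may vary)
```
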
### Automatic update of daily frequency data (from Yahoo Finance)

> It is recommended that users update the data manually once (--trading_date 2021-05-25) and then set it up to update automatically.

* Automatic update of data to the "qlib" directory on each trading day (Linux)
* use *crontab*: `crontab -e`
* set up timed tasks:

```
* * * * 1-5 python <script path> update_data_to_bin --qlib_data_1d_dir <user data dir>
```

* **script path**: *scripts/data_collector/yahoo/collector.py*

* Manual update of data

```
python scripts/data_collector/yahoo/collector.py update_data_to_bin --qlib_data_1d_dir <user data dir> --trading_date <start date> --end_date <end date>
```

* `trading_date`: the first trading day to update
* `end_date`: the last trading day to update (not included)
* `check_data_length`: check the number of rows per *symbol*, by default `None`
> if `len(symbol_df) < check_data_length`, it will be re-fetched, with the number of re-fetches coming from the `max_collector_count` parameter

* `scripts/data_collector/yahoo/collector.py update_data_to_bin` parameters:
* `source_dir`: the directory where the raw data collected from the Internet is saved, default "Path(__file__).parent/source"
* `normalize_dir`: directory for the normalized data, default "Path(__file__).parent/normalize"
* `qlib_data_1d_dir`: the qlib data to be updated for yahoo, usually from: [download qlib data](https://github.com/microsoft/qlib/tree/main/scripts#download-cn-data)
* `trading_date`: trading days to be updated, by default ``datetime.datetime.now().strftime("%Y-%m-%d")``
* `end_date`: end datetime, default ``pd.Timestamp(trading_date + pd.Timedelta(days=1))``; open interval (the end is excluded; see the snippet below)
* `region`: region, value from ["CN", "US"], default "CN"

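As a rough illustration of the open interval, the snippet below reproduces what the documented default amounts to; it is an assumption about the intent of the default expression, not code taken verbatim from the script:

```python
import pandas as pd

trading_date = "2021-06-01"
# default end_date per the parameter list above: the day after trading_date (exclusive)
end_date = pd.Timestamp(trading_date) + pd.Timedelta(days=1)
print(trading_date, "->", end_date.date())  # 2021-06-01 -> 2021-06-02, so only 2021-06-01 is updated
```
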
## Using qlib data

```python
import qlib
from qlib.data import D

# 1d data cn
# freq defaults to "day"
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region="cn")
df = D.features(D.instruments("all"), ["$close"], freq="day")

# 1min data cn
# freq=1min
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data_1min", region="cn")
inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True)
# get 100 symbols
df = D.features(inst[:100], ["$close"], freq="1min")
# get all symbol data
# df = D.features(D.instruments("all"), ["$close"], freq="1min")

# 1d data us
qlib.init(provider_uri="~/.qlib/qlib_data/us_data", region="us")
df = D.features(D.instruments("all"), ["$close"], freq="day")

# 1min data us
qlib.init(provider_uri="~/.qlib/qlib_data/us_data_1min", region="us")
inst = D.list_instruments(D.instruments("all"), freq="1min", as_list=True)
# get 100 symbols
df = D.features(inst[:100], ["$close"], freq="1min")
# get all symbol data
# df = D.features(D.instruments("all"), ["$close"], freq="1min")
```
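If you also need the trading calendar behind the loaded data, it can be queried in the same session with `D.calendar`. A small sketch (the `provider_uri` is whichever directory you initialised above):

```python
import qlib
from qlib.data import D

qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region="cn")
cal = D.calendar(start_time="2020-01-01", end_time="2020-12-31", freq="day")
print(cal[:5])  # first five trading days of 2020 present in the data
```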
File diff suppressed because it is too large
@ -0,0 +1,12 @@
loguru
fire
requests
numpy
pandas
tqdm
lxml
yahooquery
joblib
beautifulsoup4
bs4
soupsieve
@ -0,0 +1,286 @@
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
"""
|
||||
TODO:
|
||||
- A more well-designed PIT database is required.
|
||||
- seperated insert, delete, update, query operations are required.
|
||||
"""
|
||||
|
||||
import abc
|
||||
import shutil
|
||||
import struct
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Union
|
||||
from functools import partial
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
||||
|
||||
import fire
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
from loguru import logger
|
||||
from qlib.utils import fname_to_code, code_to_fname, get_period_offset
|
||||
from qlib.config import C
|
||||
|
||||
|
||||
class DumpPitData:
|
||||
PIT_DIR_NAME = "financial"
|
||||
PIT_CSV_SEP = ","
|
||||
DATA_FILE_SUFFIX = ".data"
|
||||
INDEX_FILE_SUFFIX = ".index"
|
||||
|
||||
INTERVAL_quarterly = "quarterly"
|
||||
INTERVAL_annual = "annual"
|
||||
|
||||
PERIOD_DTYPE = C.pit_record_type["period"]
|
||||
INDEX_DTYPE = C.pit_record_type["index"]
|
||||
DATA_DTYPE = "".join(
|
||||
[
|
||||
C.pit_record_type["date"],
|
||||
C.pit_record_type["period"],
|
||||
C.pit_record_type["value"],
|
||||
C.pit_record_type["index"],
|
||||
]
|
||||
)
|
||||
|
||||
NA_INDEX = C.pit_record_nan["index"]
|
||||
|
||||
INDEX_DTYPE_SIZE = struct.calcsize(INDEX_DTYPE)
|
||||
PERIOD_DTYPE_SIZE = struct.calcsize(PERIOD_DTYPE)
|
||||
DATA_DTYPE_SIZE = struct.calcsize(DATA_DTYPE)
|
||||
|
||||
UPDATE_MODE = "update"
|
||||
ALL_MODE = "all"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
csv_path: str,
|
||||
qlib_dir: str,
|
||||
backup_dir: str = None,
|
||||
freq: str = "quarterly",
|
||||
max_workers: int = 16,
|
||||
date_column_name: str = "date",
|
||||
period_column_name: str = "period",
|
||||
value_column_name: str = "value",
|
||||
field_column_name: str = "field",
|
||||
file_suffix: str = ".csv",
|
||||
exclude_fields: str = "",
|
||||
include_fields: str = "",
|
||||
limit_nums: int = None,
|
||||
):
|
||||
"""
|
||||
|
||||
Parameters
|
||||
----------
|
||||
csv_path: str
|
||||
stock data path or directory
|
||||
qlib_dir: str
|
||||
qlib(dump) data director
|
||||
backup_dir: str, default None
|
||||
if backup_dir is not None, backup qlib_dir to backup_dir
|
||||
freq: str, default "quarterly"
|
||||
data frequency
|
||||
max_workers: int, default 16
|
||||
number of threads
|
||||
date_column_name: str, default "date"
|
||||
the name of the date field in the csv
|
||||
file_suffix: str, default ".csv"
|
||||
file suffix
|
||||
include_fields: tuple
|
||||
dump fields
|
||||
exclude_fields: tuple
|
||||
fields not dumped
|
||||
limit_nums: int
|
||||
Use when debugging, default None
|
||||
"""
|
||||
csv_path = Path(csv_path).expanduser()
|
||||
if isinstance(exclude_fields, str):
|
||||
exclude_fields = exclude_fields.split(",")
|
||||
if isinstance(include_fields, str):
|
||||
include_fields = include_fields.split(",")
|
||||
self._exclude_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, exclude_fields)))
|
||||
self._include_fields = tuple(filter(lambda x: len(x) > 0, map(str.strip, include_fields)))
|
||||
self.file_suffix = file_suffix
|
||||
self.csv_files = sorted(csv_path.glob(f"*{self.file_suffix}") if csv_path.is_dir() else [csv_path])
|
||||
if limit_nums is not None:
|
||||
self.csv_files = self.csv_files[: int(limit_nums)]
|
||||
self.qlib_dir = Path(qlib_dir).expanduser()
|
||||
self.backup_dir = backup_dir if backup_dir is None else Path(backup_dir).expanduser()
|
||||
if backup_dir is not None:
|
||||
self._backup_qlib_dir(Path(backup_dir).expanduser())
|
||||
|
||||
self.works = max_workers
|
||||
self.date_column_name = date_column_name
|
||||
self.period_column_name = period_column_name
|
||||
self.value_column_name = value_column_name
|
||||
self.field_column_name = field_column_name
|
||||
|
||||
self._mode = self.ALL_MODE
|
||||
|
||||
def _backup_qlib_dir(self, target_dir: Path):
|
||||
shutil.copytree(str(self.qlib_dir.resolve()), str(target_dir.resolve()))
|
||||
|
||||
def get_source_data(self, file_path: Path) -> pd.DataFrame:
|
||||
df = pd.read_csv(str(file_path.resolve()), low_memory=False)
|
||||
df[self.value_column_name] = df[self.value_column_name].astype("float32")
|
||||
df[self.date_column_name] = df[self.date_column_name].str.replace("-", "").astype("int32")
|
||||
# df.drop_duplicates([self.date_field_name], inplace=True)
|
||||
return df
|
||||
|
||||
def get_symbol_from_file(self, file_path: Path) -> str:
|
||||
return fname_to_code(file_path.name[: -len(self.file_suffix)].strip().lower())
|
||||
|
||||
def get_dump_fields(self, df: Iterable[str]) -> Iterable[str]:
|
||||
return (
|
||||
set(self._include_fields)
|
||||
if self._include_fields
|
||||
else set(df[self.field_column_name]) - set(self._exclude_fields)
|
||||
if self._exclude_fields
|
||||
else set(df[self.field_column_name])
|
||||
)
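# Field selection precedence (reading the chained conditional above):
#   1) if include_fields is non-empty -> dump exactly those fields
#   2) elif exclude_fields is non-empty -> dump every field found in the csv minus the excluded ones
#   3) else -> dump every field found in the csv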
|
||||
|
||||
def get_filenames(self, symbol, field, interval):
|
||||
dir_name = self.qlib_dir.joinpath(self.PIT_DIR_NAME, symbol)
|
||||
dir_name.mkdir(parents=True, exist_ok=True)
|
||||
return (
|
||||
dir_name.joinpath(f"{field}_{interval[0]}{self.DATA_FILE_SUFFIX}".lower()),
|
||||
dir_name.joinpath(f"{field}_{interval[0]}{self.INDEX_FILE_SUFFIX}".lower()),
|
||||
)
|
||||
|
||||
def _dump_pit(
|
||||
self,
|
||||
file_path: str,
|
||||
interval: str = "quarterly",
|
||||
overwrite: bool = False,
|
||||
):
|
||||
"""
|
||||
dump data as the following format:
|
||||
`/path/to/<field>.data`
|
||||
[date, period, value, _next]
|
||||
[date, period, value, _next]
|
||||
[...]
|
||||
`/path/to/<field>.index`
|
||||
[first_year, index, index, ...]
|
||||
|
||||
`<field>.data` contains the data in point-in-time (PIT) order: the `value` for `period`
is published at `date`, and its successive revised values can be found via `_next` (a linked list).

`<field>.index` contains the index of the value for each period (quarter or year). To save
disk space, we only store `first_year`, as the following periods can easily be inferred.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
file_path: str
path to the csv file of a single stock
|
||||
interval: str
|
||||
data interval
|
||||
overwrite: bool
|
||||
whether overwrite existing data or update only
|
||||
"""
|
||||
symbol = self.get_symbol_from_file(file_path)
try:
df = self.get_source_data(file_path)
except Exception as err:
# NOTE: return here, otherwise `df` would be undefined below and raise a NameError
logger.warning(f"failed to load {file_path}: {err}")
return
if df.empty:
logger.warning(f"{symbol} file is empty")
return
|
||||
for field in self.get_dump_fields(df):
|
||||
df_sub = df.query(f'{self.field_column_name}=="{field}"').sort_values(self.date_column_name)
|
||||
if df_sub.empty:
|
||||
logger.warning(f"field {field} of {symbol} is empty")
|
||||
continue
|
||||
data_file, index_file = self.get_filenames(symbol, field, interval)
|
||||
|
||||
## calculate first & last period
|
||||
start_year = df_sub[self.period_column_name].min()
|
||||
end_year = df_sub[self.period_column_name].max()
|
||||
if interval == self.INTERVAL_quarterly:
|
||||
start_year //= 100
|
||||
end_year //= 100
|
||||
|
||||
# adjust `first_year` if existing data found
|
||||
if not overwrite and index_file.exists():
|
||||
with open(index_file, "rb") as fi:
|
||||
(first_year,) = struct.unpack(self.PERIOD_DTYPE, fi.read(self.PERIOD_DTYPE_SIZE))
|
||||
n_years = len(fi.read()) // self.INDEX_DTYPE_SIZE
|
||||
if interval == self.INTERVAL_quarterly:
|
||||
n_years //= 4
|
||||
start_year = first_year + n_years
|
||||
else:
|
||||
with open(index_file, "wb") as f:
|
||||
f.write(struct.pack(self.PERIOD_DTYPE, start_year))
|
||||
first_year = start_year
|
||||
|
||||
# if data already exists, continue to the next field
|
||||
if start_year > end_year:
|
||||
logger.warning(f"{symbol}-{field} data already exists, continue to the next field")
|
||||
continue
|
||||
|
||||
# dump index filled with NA
|
||||
with open(index_file, "ab") as fi:
|
||||
for year in range(start_year, end_year + 1):
|
||||
if interval == self.INTERVAL_quarterly:
|
||||
fi.write(struct.pack(self.INDEX_DTYPE * 4, *[self.NA_INDEX] * 4))
|
||||
else:
|
||||
fi.write(struct.pack(self.INDEX_DTYPE, self.NA_INDEX))
|
||||
|
||||
# if data already exists, remove overlapped data
|
||||
if not overwrite and data_file.exists():
|
||||
with open(data_file, "rb") as fd:
|
||||
fd.seek(-self.DATA_DTYPE_SIZE, 2)
|
||||
last_date, _, _, _ = struct.unpack(self.DATA_DTYPE, fd.read())
|
||||
df_sub = df_sub.query(f"{self.date_column_name}>{last_date}")
|
||||
# otherwise,
|
||||
# 1) truncate existing file or create a new file with `wb+` if overwrite,
|
||||
# 2) or append existing file or create a new file with `ab+` if not overwrite
|
||||
else:
|
||||
with open(data_file, "wb+" if overwrite else "ab+"):
|
||||
pass
|
||||
|
||||
with open(data_file, "rb+") as fd, open(index_file, "rb+") as fi:
|
||||
|
||||
# update index if needed
|
||||
for i, row in df_sub.iterrows():
|
||||
# get index
|
||||
offset = get_period_offset(first_year, row.period, interval == self.INTERVAL_quarterly)
|
||||
|
||||
fi.seek(self.PERIOD_DTYPE_SIZE + self.INDEX_DTYPE_SIZE * offset)
|
||||
(cur_index,) = struct.unpack(self.INDEX_DTYPE, fi.read(self.INDEX_DTYPE_SIZE))
|
||||
|
||||
# Case I: new data => update `_next` with current index
|
||||
if cur_index == self.NA_INDEX:
|
||||
fi.seek(self.PERIOD_DTYPE_SIZE + self.INDEX_DTYPE_SIZE * offset)
|
||||
fi.write(struct.pack(self.INDEX_DTYPE, fd.tell()))
|
||||
# Case II: previous data exists => find and update the last `_next`
|
||||
else:
|
||||
_cur_fd = fd.tell()
|
||||
prev_index = self.NA_INDEX
|
||||
while cur_index != self.NA_INDEX: # NOTE: first iter always != NA_INDEX
|
||||
fd.seek(cur_index + self.DATA_DTYPE_SIZE - self.INDEX_DTYPE_SIZE)
|
||||
prev_index = cur_index
|
||||
(cur_index,) = struct.unpack(self.INDEX_DTYPE, fd.read(self.INDEX_DTYPE_SIZE))
|
||||
fd.seek(prev_index + self.DATA_DTYPE_SIZE - self.INDEX_DTYPE_SIZE)
|
||||
fd.write(struct.pack(self.INDEX_DTYPE, _cur_fd)) # NOTE: add _next pointer
|
||||
fd.seek(_cur_fd)
|
||||
|
||||
# dump data
|
||||
fd.write(struct.pack(self.DATA_DTYPE, row.date, row.period, row.value, self.NA_INDEX))
|
||||
|
||||
def dump(self, interval="quarterly", overwrite=False):
|
||||
logger.info("start dump pit data......")
|
||||
_dump_func = partial(self._dump_pit, interval=interval, overwrite=overwrite)
|
||||
|
||||
with tqdm(total=len(self.csv_files)) as p_bar:
|
||||
with ProcessPoolExecutor(max_workers=self.works) as executor:
|
||||
for _ in executor.map(_dump_func, self.csv_files):
|
||||
p_bar.update()
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
self.dump()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(DumpPitData)
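# A minimal usage sketch (hypothetical paths, mirroring the class defined above):
#
#   DumpPitData(
#       csv_path="~/.qlib/stock_data/source/pit",   # csv files with date/period/value/field columns
#       qlib_dir="~/.qlib/qlib_data/cn_data",
#   ).dump(interval="quarterly")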
|
@ -0,0 +1,9 @@
|
||||
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import fire
from qlib.tests.data import GetData


if __name__ == "__main__":
    fire.Fire(GetData)
|
@ -0,0 +1,22 @@
|
||||
# -*- coding: utf-8 -*-
import sys

sys.path.append('C:\\Program Files\\tinysoft\\Analyse.NET')
import TSLPy3 as ts


# Log in to the TinySoft server
def login():
    if not ts.Logined():
        a = ts.ConnectServer('tsl.tinysoft.com.cn', 443)
        b = ts.LoginServer('fsfundsh', 'fsfund')
        if a != 0 or b[0] != 0:
            raise Exception("Cannot connect to tsl server")


# Disconnect from TinySoft
def logoff():
    if ts.Logined():
        ts.Disconnect()


# Check whether the connection is established
def logined():
    return ts.Logined()
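# A minimal usage sketch (an assumption about typical use of the helpers above):
#   login()
#   # ... issue TSLPy3 queries here ...
#   logoff()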
|
@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[34]:
|
||||
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import os
|
||||
from datetime import datetime
|
||||
def to_last_season(datestamp):
'''Convert the current quarter-end date to the previous quarter-end date'''
last_season = (pd.Series(datestamp) - pd.DateOffset(months=3)).apply(
lambda x: x + pd.to_timedelta('1 days') if x.month == 3 else x)
return last_season
|
||||
|
||||
def intdate(datestamp):
'''Convert a date to an 8-digit integer (YYYYMMDD)'''
return int(datestamp.strftime('%Y%m%d'))
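# A quick check of the helpers above (illustrative values, not from the source):
#   to_last_season(pd.Timestamp('2019-06-30'))[0] -> Timestamp('2019-03-31')
#   (subtracting 3 months gives 2019-03-30; the "+1 day" correction applies when the month is March)
#   intdate(pd.Timestamp('2019-03-31')) -> 20190331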
|
||||
|
||||
def non_rec_deduct_profit_single_growth(df):
profit = df['扣非净利润'].values
ss1 = pd.to_datetime(df['截止日'], format='%Y%m%d')  # current quarter end
ss2 = to_last_season(ss1)  # previous quarter end
ss3 = ss1 - pd.DateOffset(years=1)  # same quarter last year
ss4 = to_last_season(ss3)  # previous quarter of last year
profit_mat = []  # [current-quarter, previous-quarter, last-year same-quarter, last-year previous-quarter, single-quarter, last-year single-quarter] profit
single_profit_pctchg = []  # single-quarter profit, year-over-year change
|
||||
for i in range(ss1.shape[0]):
s1 = ss1[:i+1].max()  # latest report period available as of the current announcement date
s2 = to_last_season(s1)[0]
s3 = s1 - pd.DateOffset(years=1)
s4 = to_last_season(s3)[0]
latest_seasons = [intdate(s1), intdate(s2), intdate(s3), intdate(s4)]  # report periods needed for the latest YoY comparison
adjust_season = np.where(np.in1d(latest_seasons, intdate(ss1[i])))[0]
|
||||
|
||||
if len(adjust_season):
|
||||
adjust_season = adjust_season[0]
|
||||
if adjust_season == 0:
# latest report period: compute the latest year-over-year change
if ss2[i].month != 12:
|
||||
s2 = np.where(ss1[:i] == ss2[i])[0]
|
||||
s4 = np.where(ss1[:i] == ss4[i])[0]
|
||||
p2 = profit[s2[-1]] if len(s2) else np.nan
|
||||
p4 = profit[s4[-1]] if len(s4) else np.nan
|
||||
else:
|
||||
p2 = 0
|
||||
p4 = 0
|
||||
s3 = np.where(ss1[:i] == ss3[i])[0]
|
||||
p3 = profit[s3[-1]] if len(s3) else np.nan
|
||||
p1 = profit[i]
|
||||
profit_mat.append([p1, p2, p3, p4, p1-p2, p3-p4])
|
||||
else:
# not the latest report period, but the adjusted data falls within the range used by the latest period
profit_adjust = profit_mat[-1].copy()
profit_adjust[adjust_season] = 0 if latest_seasons[adjust_season] % 10000 == 1231 and adjust_season in [1, 3] else profit[i]  # replace the corresponding quarter with the latest data
|
||||
if adjust_season <= 1:
|
||||
profit_adjust[4] = profit_adjust[0] - profit_adjust[1]
|
||||
else:
|
||||
profit_adjust[5] = profit_adjust[2] - profit_adjust[3]
|
||||
profit_mat.append(profit_adjust)
|
||||
else:
# not the latest report period, and the adjusted data is outside the range used by the latest period
profit_mat.append(profit_mat[-1])
|
||||
|
||||
single_profit_pctchg.append((profit_mat[-1][4] - profit_mat[-1][5]) / abs(profit_mat[-1][5])
|
||||
if profit_mat[-1][5] != 0 else np.nan)
|
||||
profit_mat = pd.DataFrame(profit_mat, columns=['当季',
|
||||
'上季',
|
||||
'去年当季',
|
||||
'去年上季',
|
||||
'单季',
|
||||
'去年单季'])
|
||||
single_profit_pctchg = pd.Series(single_profit_pctchg, name='同比')
|
||||
|
||||
return pd.concat([pd.DataFrame({'StockID': df['StockID'],
|
||||
'截止日': ss1.apply(intdate),
|
||||
'公布日': df['公布日']}),
|
||||
profit_mat,
|
||||
single_profit_pctchg,
|
||||
df['公告类型']], axis=1)
|
||||
|
||||
# stocklist = os.listdir('D:/数据/天软基本面数据/42.主要财务指标')
|
||||
# df_list = []
|
||||
# for stock in stocklist:
|
||||
# df = pd.read_csv('D:/数据/天软基本面数据/42.主要财务指标/' + stock)
|
||||
# factor = non_rec_deduct_profit_single_growth(df)
|
||||
# df_list.append(factor)
|
||||
# factor_df = pd.concat(df_list, axis=1, join='outer')
|
||||
# factor_df.fillna(method='ffill', inplace=True)
|
||||
# factor_df = factor_df.join(factor, how='outer')
|
||||
|
||||
|
||||
# In[35]:
|
||||
|
||||
|
||||
# Earnings forecast
|
||||
# data = pd.read_csv('D:/数据/天软基本面数据/40.业绩预测/SZ000048.csv')
|
||||
# df1 = pd.DataFrame({'StockID': data['StockID'],
|
||||
# '截止日': data['截止日'],
|
||||
# '公布日': data['公布日'],
|
||||
# '归母净利润': (data['盈利金额上限'] + data['盈利金额下限']) * 10000/2,
|
||||
# '公告类型': '业绩预测'})
|
||||
# Earnings flash report (preliminary results)
|
||||
# data = pd.read_csv('D:/数据/天软基本面数据/41.业绩快报/SZ000048.csv')
|
||||
# df2 = pd.DataFrame({'StockID': data['StockID'],
|
||||
# '截止日': data['截止日'],
|
||||
# '公布日': data['公布日'],
|
||||
# '归母净利润': data['归属于母公司所有者净利润'],
|
||||
# '公告类型': '业绩快报'})
|
||||
# Official report
|
||||
filelist = os.listdir('D:/数据/天软基本面数据/46.合并利润分配表')
|
||||
for file in filelist:
|
||||
data1 = pd.read_csv('D:/数据/天软基本面数据/42.主要财务指标/' + file)
|
||||
data2 = pd.read_csv('D:/数据/天软基本面数据/46.合并利润分配表/' + file)
|
||||
A = pd.DataFrame({'StockID': data1['StockID'],
|
||||
'截止日': data1['截止日'],
|
||||
'公布日': data1['公布日'],
|
||||
'首次公布扣非净利润': data1['扣除非经常性损益后的净利润']})
|
||||
B = pd.DataFrame({'StockID': data2['StockID'],
|
||||
'截止日': data2['截止日'],
|
||||
'公布日': data2['公布日'],
|
||||
'归母净利润': data2['归属于母公司所有者净利润']})
|
||||
A.index = A['截止日'].astype('str') + A['公布日'].astype('str')
|
||||
B.index = B['截止日'].astype('str') + B['公布日'].astype('str')
|
||||
df3 = B.join(A['首次公布扣非净利润'], how='left')
|
||||
df3.sort_values(['公布日', '截止日'], inplace=True)
|
||||
df3.reset_index(drop=True, inplace=True)
|
||||
df3['非经常性损益'] = df3['归母净利润'] - df3['首次公布扣非净利润']
|
||||
df3['非经常性损益'] = df3.groupby('截止日')['非经常性损益'].transform(lambda x: x.fillna(method='ffill'))
|
||||
df3['扣非净利润'] = df3['归母净利润'] - df3['非经常性损益']
|
||||
df3 = df3[['StockID', '截止日', '公布日', '扣非净利润']]
|
||||
df3['公告类型'] = '正式报告'
|
||||
|
||||
# df = pd.concat([df1, df2, df3], axis=0)
|
||||
# df.sort_values(['公布日', '截止日'], inplace=True)
|
||||
# df.reset_index(inplace=True)
|
||||
result = non_rec_deduct_profit_single_growth(df3)
|
||||
result.to_csv('D:/数据/天软基本面数据/单季度扣非同比/' + file, index=False)
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,58 @@
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
export_cols_bs = {
|
||||
'存货' : 'INVENTORIES',
|
||||
'负债合计' : 'TOT_LIAB',
|
||||
'流动资产合计' : 'TOT_CUR_ASSETS'
|
||||
}
|
||||
|
||||
export_cols_pl = {
|
||||
'净利润' : 'NET_PROFIT_INCL_MIN_INT_INC',
|
||||
'营业外收入' : 'PLUS_NON_OPER_REV',
|
||||
'营业收入' : 'OPER_REV'
|
||||
}
|
||||
|
||||
def adj_split(df):
|
||||
df.reset_index(inplace=True)
|
||||
|
||||
df_ori = df[df['截止日'] == df['数据报告期']]
|
||||
df_adj = df[df['截止日'] != df['数据报告期']]
|
||||
return df_ori, df_adj
|
||||
|
||||
|
||||
def to_qlib_format_pl(df):
|
||||
index_cols = ['截止日', '数据报告期', '公布日']
|
||||
sel_cols = index_cols + list(export_cols_pl)
|
||||
|
||||
df_export = df[sel_cols]
|
||||
df_export.set_index(index_cols, inplace=True)
|
||||
|
||||
df_export_ori, df_export_adj = adj_split(df_export)
|
||||
df_export_ori.set_index(index_cols, inplace=True)
|
||||
df_export_adj.set_index(index_cols, inplace=True)
|
||||
|
||||
adj_col_rename = {name : name+'(调整)' for name in export_cols_pl.keys()}
|
||||
df_export_adj.rename(columns=adj_col_rename, inplace=True)
|
||||
|
||||
df_list = []
|
||||
|
||||
def _T(df, df_list):
|
||||
for col in list(df.columns):
|
||||
df_tmp = df[[col]].copy(deep=True)
|
||||
df_tmp['field'] = col
|
||||
df_tmp.rename(columns={col:'value'}, inplace=True)
|
||||
df_list.append(df_tmp)
|
||||
|
||||
_T(df_export_adj, df_list)
|
||||
_T(df_export_ori, df_list)
|
||||
|
||||
df = pd.concat(df_list, axis=0)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
def single_quarter(df):
# NOTE: unfinished in the source -- the original body was only the pandas `sort_values` signature.
# Kept as a placeholder that returns the frame sorted by announcement date ('公布日') when present.
return df.sort_values(by='公布日', ascending=True) if '公布日' in df.columns else df
|
||||
|
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large