From 320dac257b64e4a7764b84042373b815c8e4bf51 Mon Sep 17 00:00:00 2001 From: Artem Viznyuk Date: Tue, 10 Aug 2021 19:37:39 +0300 Subject: [PATCH 1/6] =?UTF-8?q?=D0=B7=D0=B0=D0=B4=D0=B0=D0=BD=D0=B8=D0=B5?= =?UTF-8?q?=20=D1=83=D1=80=D0=BE=D0=BA=D0=B0=201?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lesson01_NumPy01.ipynb | 125 ++++++ lesson01_Pandas01.ipynb | 831 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 956 insertions(+) create mode 100644 lesson01_NumPy01.ipynb create mode 100644 lesson01_Pandas01.ipynb diff --git a/lesson01_NumPy01.ipynb b/lesson01_NumPy01.ipynb new file mode 100644 index 0000000..03a6eac --- /dev/null +++ b/lesson01_NumPy01.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 46, + "id": "14f56e8d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Исходный массив :\n", + "[[ 1 2 3 3 1]\n", + " [ 6 8 11 10 7]]\n", + "Усредненный массив :\n", + "[2. 8.4]\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "a = np.array([\n", + " [1, 2, 3, 3, 1],\n", + " [6, 8, 11, 10, 7]\n", + " ])\n", + "\n", + "print(\"Исходный массив :\")\n", + "print(a)\n", + "\n", + "mean_a = a.mean(axis=1)\n", + "print(\"Усредненный массив :\")\n", + "print(mean_a)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "4744d693", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Центрированный массив:\n", + "[[-1. -2.4]\n", + " [ 0. -0.4]\n", + " [ 1. 2.6]\n", + " [ 1. 1.6]\n", + " [-1. 
-1.4]]\n" + ] + } + ], + "source": [ + "np.shape(a)\n", + "np.shape(mean_a)\n", + "a_centered = np.array(a.T - mean_a)\n", + "print(\"Центрированный массив:\")\n", + "print(a_centered)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "0736a48d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(5, 2)\n", + "[[-1.]\n", + " [ 0.]\n", + " [ 1.]\n", + " [ 1.]\n", + " [-1.]]\n", + "[[-2.4]\n", + " [-0.4]\n", + " [ 2.6]\n", + " [ 1.6]\n", + " [-1.4]]\n", + "скалярное произведение столбцов:\n", + "8.0\n" + ] + } + ], + "source": [ + "print(np.shape(a_centered))\n", + "\n", + "a1 = np.array(a_centered[0:, 0:1])\n", + "print(a1)\n", + "\n", + "a2 = np.array(a_centered[0:, 1:2])\n", + "print(a2)\n", + "\n", + "a_centered_sp = np.dot(a1.flatten(), a2.flatten())\n", + "print(\"скалярное произведение столбцов:\")\n", + "print(a_centered_sp)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lesson01_Pandas01.ipynb b/lesson01_Pandas01.ipynb new file mode 100644 index 0000000..19cfa47 --- /dev/null +++ b/lesson01_Pandas01.ipynb @@ -0,0 +1,831 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "8233d46b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
author_idauthor_name
01Тургенев
12Чехов
23Островский
\n", + "
" + ], + "text/plain": [ + " author_id author_name\n", + "0 1 Тургенев\n", + "1 2 Чехов\n", + "2 3 Островский" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "temp = {\n", + " \"author_id\": [1, 2, 3],\n", + " \"author_name\": ['Тургенев', 'Чехов', 'Островский']\n", + "}\n", + "\n", + "authors = pd.DataFrame(temp)\n", + "authors" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8751b448", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
author_idbook_titleprice
01Отцы и дети450
11Рудин300
21Дворянское гнездо350
32Толстый и тонкий500
42Дама с собачкой450
53Гроза370
63Таланты и поклонники290
\n", + "
" + ], + "text/plain": [ + " author_id book_title price\n", + "0 1 Отцы и дети 450\n", + "1 1 Рудин 300\n", + "2 1 Дворянское гнездо 350\n", + "3 2 Толстый и тонкий 500\n", + "4 2 Дама с собачкой 450\n", + "5 3 Гроза 370\n", + "6 3 Таланты и поклонники 290" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "temp = {\n", + " \"author_id\": [1, 1, 1, 2, 2, 3, 3],\n", + " \"book_title\": ['Отцы и дети', 'Рудин', 'Дворянское гнездо', 'Толстый и тонкий', 'Дама с собачкой', 'Гроза', 'Таланты и поклонники'],\n", + " \"price\": [450, 300, 350, 500, 450, 370, 290]\n", + "}\n", + "\n", + "book = pd.DataFrame(temp)\n", + "book" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bbdfad76", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
author_idauthor_namebook_titleprice
01ТургеневОтцы и дети450
11ТургеневРудин300
21ТургеневДворянское гнездо350
32ЧеховТолстый и тонкий500
42ЧеховДама с собачкой450
53ОстровскийГроза370
63ОстровскийТаланты и поклонники290
\n", + "
" + ], + "text/plain": [ + " author_id author_name book_title price\n", + "0 1 Тургенев Отцы и дети 450\n", + "1 1 Тургенев Рудин 300\n", + "2 1 Тургенев Дворянское гнездо 350\n", + "3 2 Чехов Толстый и тонкий 500\n", + "4 2 Чехов Дама с собачкой 450\n", + "5 3 Островский Гроза 370\n", + "6 3 Островский Таланты и поклонники 290" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "authors_price = pd.merge(authors, book, on='author_id', how='inner')\n", + "authors_price" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b10a0275", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
author_idauthor_namebook_titleprice
32ЧеховТолстый и тонкий500
01ТургеневОтцы и дети450
42ЧеховДама с собачкой450
53ОстровскийГроза370
21ТургеневДворянское гнездо350
11ТургеневРудин300
63ОстровскийТаланты и поклонники290
\n", + "
" + ], + "text/plain": [ + " author_id author_name book_title price\n", + "3 2 Чехов Толстый и тонкий 500\n", + "0 1 Тургенев Отцы и дети 450\n", + "4 2 Чехов Дама с собачкой 450\n", + "5 3 Островский Гроза 370\n", + "2 1 Тургенев Дворянское гнездо 350\n", + "1 1 Тургенев Рудин 300\n", + "6 3 Островский Таланты и поклонники 290" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "authors_price.sort_values(by=\"price\", ascending=False, inplace=True)\n", + "\n", + "authors_price" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ccb915d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
author_idauthor_namebook_titleprice
32ЧеховТолстый и тонкий500
01ТургеневОтцы и дети450
42ЧеховДама с собачкой450
53ОстровскийГроза370
21ТургеневДворянское гнездо350
\n", + "
" + ], + "text/plain": [ + " author_id author_name book_title price\n", + "3 2 Чехов Толстый и тонкий 500\n", + "0 1 Тургенев Отцы и дети 450\n", + "4 2 Чехов Дама с собачкой 450\n", + "5 3 Островский Гроза 370\n", + "2 1 Тургенев Дворянское гнездо 350" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "top5_1 = authors_price.head(5)\n", + "top5_1" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "29963342", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3 500\n", + "0 450\n", + "4 450\n", + "5 370\n", + "2 350\n", + "Name: price, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "### или top5 в одно дейтсвие:\n", + "top5 = authors_price['price'].head(5)\n", + "top5" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "5234bc10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
author_name
Островский370
Тургенев450
Чехов500
\n", + "
" + ], + "text/plain": [ + " price\n", + "author_name \n", + "Островский 370\n", + "Тургенев 450\n", + "Чехов 500" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_price = authors_price.groupby('author_name').agg({'price':'max'})\n", + "max_price" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "818f6ae8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
author_name
Островский290
Тургенев300
Чехов450
\n", + "
" + ], + "text/plain": [ + " price\n", + "author_name \n", + "Островский 290\n", + "Тургенев 300\n", + "Чехов 450" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "min_price = authors_price.groupby('author_name').agg({'price':'min'})\n", + "min_price" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "dc9e6cb4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
author_name
Островский330.000000
Тургенев366.666667
Чехов475.000000
\n", + "
" + ], + "text/plain": [ + " price\n", + "author_name \n", + "Островский 330.000000\n", + "Тургенев 366.666667\n", + "Чехов 475.000000" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_price = authors_price.groupby('author_name').agg({'price':'mean'})\n", + "mean_price" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "8ed27e68", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
max_pricemin_Pricemean_price
author_name
Островский370290330.000000
Тургенев450300366.666667
Чехов500450475.000000
\n", + "
" + ], + "text/plain": [ + " max_price min_Price mean_price\n", + "author_name \n", + "Островский 370 290 330.000000\n", + "Тургенев 450 300 366.666667\n", + "Чехов 500 450 475.000000" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "authors_stat = pd.merge(max_price, min_price, on='author_name', how='inner')\n", + "authors_stat = pd.merge(authors_stat, mean_price, on='author_name', how='inner')\n", + "authors_stat.rename(columns={'price_x': 'max_price', 'price_y': 'min_Price', 'price': 'mean_price'}, inplace=True)\n", + "authors_stat" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ab65b92f482c6552d7051374d5db0929deb96eb6 Mon Sep 17 00:00:00 2001 From: Artem Viznyuk Date: Tue, 17 Aug 2021 14:38:29 +0300 Subject: [PATCH 2/6] =?UTF-8?q?=D0=B2=D1=8B=D0=BF=D0=BE=D0=BB=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=BE=20=D0=B7=D0=B0=D0=B4=D0=B0=D0=BD=D0=B8=D0=B5=20?= =?UTF-8?q?=D1=83=D1=80=D0=BE=D0=BA=D0=B0=202?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lesson02_MathPlotLib01.ipynb | 3492 ++++++++++++++++++++++++++++++++++ 1 file changed, 3492 insertions(+) create mode 100644 lesson02_MathPlotLib01.ipynb diff --git a/lesson02_MathPlotLib01.ipynb b/lesson02_MathPlotLib01.ipynb new file mode 100644 index 0000000..361ce48 --- /dev/null +++ b/lesson02_MathPlotLib01.ipynb @@ -0,0 +1,3492 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "4f89780a", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " 2021-08-15T22:32:30.504005\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.4.0, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "\n", + "%matplotlib inline\n", + "%config InlineBackend.figure_format = 'svg'\n", + "\n", + "x = [1, 2, 3, 4, 5, 6, 7]\n", + "y = [3.5, 3.8, 4.2, 4.5, 5, 5.5, 7]\n", + "\n", + "plt.plot(x, y)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "797cdfd5", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2021-08-15T22:33:31.769391\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.4.0, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(x, y)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4c53e091", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2021-08-15T22:42:37.502501\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.4.0, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "t = np.linspace(0, 10, 51)\n", + "\n", + "f = np.cos(t) \n", + "plt.plot(t, f, 'green')\n", + "plt.title('График f(t)')\n", + "plt.xlabel('Значения t')\n", + "plt.ylabel('Значения f')\n", + "plt.axis([0.5, 9.5, -2.5, 2.5])\n", + "plt.show()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "dcb69571", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2021-08-16T22:44:43.325894\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.4.0, https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "x = np.linspace(-3, 3, 51)\n", + "y1 = x**2\n", + "y2 = 2 * x + 0.5\n", + "y3 = -3 * x - 1.5\n", + "y4 = np.sin(x)\n", + "\n", + "fig, ax = plt.subplots(nrows=2, ncols=2)\n", + "ax = ax.flatten()\n", + "fig.subplots_adjust(wspace=0.3, hspace=0.3)\n", + "fig.set_size_inches(8, 6)\n", + "\n", + "ax[0].set_title(\"График y1\")\n", + "ax[0].plot(x, y1)\n", + "ax[0].set_xlim([-5,5])\n", + "\n", + "ax[1].set_title(\"График y2\")\n", + "ax[1].plot(x, y2)\n", + "\n", + "ax[2].set_title(\"График y3\")\n", + "ax[2].plot(x, y3)\n", + "\n", + "ax[3].set_title(\"График y4\")\n", + "ax[3].plot(x, y4)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0e3c7170cb9080c5f9923ec5a16eb91de146ea8f Mon Sep 17 00:00:00 2001 From: Artem Viznyuk Date: Thu, 26 Aug 2021 08:35:50 +0300 Subject: [PATCH 3/6] =?UTF-8?q?=D0=B2=D1=8B=D0=BF=D0=BE=D0=BB=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=BE=20=D0=B7=D0=B0=D0=B4=D0=B0=D0=BD=D0=B8=D0=B5=20?= =?UTF-8?q?=D1=83=D1=80=D0=BE=D0=BA=D0=B0=203?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lesson03_Sklearn01.ipynb | 518 +++++++++++++++++++++++++++++++++++++++ lesson03_Sklearn02.ipynb | 516 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1034 insertions(+) create mode 100644 lesson03_Sklearn01.ipynb create mode 100644 lesson03_Sklearn02.ipynb diff --git a/lesson03_Sklearn01.ipynb b/lesson03_Sklearn01.ipynb new file mode 100644 index 0000000..69e423e --- /dev/null +++ b/lesson03_Sklearn01.ipynb @@ -0,0 +1,518 @@ 
+{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "dd2114a1", + "metadata": {}, + "outputs": [], + "source": [ + "# загрузка основных модулей\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a4f7afe6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
..........................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.67
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.08
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.64
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.48
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.88
\n", + "

506 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 \n", + "502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 \n", + "503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 \n", + "504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 \n", + "505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 \n", + ".. ... ... ... \n", + "501 21.0 391.99 9.67 \n", + "502 21.0 396.90 9.08 \n", + "503 21.0 396.90 5.64 \n", + "504 21.0 393.45 6.48 \n", + "505 21.0 396.90 7.88 \n", + "\n", + "[506 rows x 13 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загрузка исходных данных\n", + "\n", + "from sklearn.datasets import load_boston\n", + "boston = load_boston()\n", + "\n", + "data = boston[\"data\"]\n", + "feature_names = boston[\"feature_names\"]\n", + "target = boston[\"target\"]\n", + "\n", + "# X матрица признаков\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "\n", + "\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f74b4ec9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
......
50122.4
50220.6
50323.9
50422.0
50511.9
\n", + "

506 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2\n", + ".. ...\n", + "501 22.4\n", + "502 20.6\n", + "503 23.9\n", + "504 22.0\n", + "505 11.9\n", + "\n", + "[506 rows x 1 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# y вектор результатов (цена недвижимости)\n", + "y = pd.DataFrame(target, columns=[\"price\"])\n", + "\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1fc8b722", + "metadata": {}, + "outputs": [], + "source": [ + "# разбиваем выборку на Обучающую и тестовую\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5fc069a9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression()" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# создаем модель линейной регрессии \n", + "\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "lr = LinearRegression()\n", + "\n", + "lr.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4e801d30", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(152, 1)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# делаем предсказание для тестовых данных\n", + "\n", + "y_pred = lr.predict(X_test)\n", + "\n", + "y_pred.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3054c3ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7112260057484908" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# расчет параметра детерминации r2\n", + 
"from sklearn.metrics import r2_score\n", + "r2 = r2_score(y_test, y_pred)\n", + "r2" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lesson03_Sklearn02.ipynb b/lesson03_Sklearn02.ipynb new file mode 100644 index 0000000..3e1002e --- /dev/null +++ b/lesson03_Sklearn02.ipynb @@ -0,0 +1,516 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f55ba795", + "metadata": {}, + "outputs": [], + "source": [ + "# загрузка основных модулей\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "06353144", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
..........................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.67
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.08
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.64
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.48
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.88
\n", + "

506 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 \n", + "502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 \n", + "503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 \n", + "504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 \n", + "505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 \n", + ".. ... ... ... \n", + "501 21.0 391.99 9.67 \n", + "502 21.0 396.90 9.08 \n", + "503 21.0 396.90 5.64 \n", + "504 21.0 393.45 6.48 \n", + "505 21.0 396.90 7.88 \n", + "\n", + "[506 rows x 13 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загрузка исходных данных\n", + "\n", + "from sklearn.datasets import load_boston\n", + "boston = load_boston()\n", + "\n", + "data = boston[\"data\"]\n", + "feature_names = boston[\"feature_names\"]\n", + "target = boston[\"target\"]\n", + "\n", + "# X матрица признаков\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "\n", + "\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bbba2fd9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
......
50122.4
50220.6
50323.9
50422.0
50511.9
\n", + "

506 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2\n", + ".. ...\n", + "501 22.4\n", + "502 20.6\n", + "503 23.9\n", + "504 22.0\n", + "505 11.9\n", + "\n", + "[506 rows x 1 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# y вектор результатов (цена недвижимости)\n", + "y = pd.DataFrame(target, columns=[\"price\"])\n", + "\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a39b337c", + "metadata": {}, + "outputs": [], + "source": [ + "# разбиваем выборку на Обучающую и тестовую\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "58280abb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# создаем модель\n", + "\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "model = RandomForestRegressor(n_estimators = 1000, max_depth = 12, random_state = 42)\n", + "\n", + "model.fit(X_train, y_train.values[:, 0])\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1f8fe75a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(152,)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# делаем предсказание для тестовых данных\n", + "\n", + "y_pred = model.predict(X_test)\n", + "\n", + "y_pred.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d9589f83", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.87472606157312" + ] + }, + "execution_count": 16, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# расчет параметра детерминации r2\n", + "from sklearn.metrics import r2_score\n", + "r2 = r2_score(y_test, y_pred)\n", + "r2" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e7fd746be8aeb0516b353cf5a71c6c5785806ecd Mon Sep 17 00:00:00 2001 From: Artem Viznyuk Date: Mon, 30 Aug 2021 23:50:39 +0300 Subject: [PATCH 4/6] =?UTF-8?q?=D0=B2=D1=8B=D0=BF=D0=BE=D0=BB=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=BE=20=D0=B7=D0=B0=D0=B4=D0=B0=D0=BD=D0=B8=D0=B5=20?= =?UTF-8?q?=D1=83=D1=80=D0=BE=D0=BA=D0=B0=204?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lesson04_ScikitLearn01.ipynb | 1309 +++++++++++++++++++ lesson04_ScikitLearn02.ipynb | 2289 ++++++++++++++++++++++++++++++++++ 2 files changed, 3598 insertions(+) create mode 100644 lesson04_ScikitLearn01.ipynb create mode 100644 lesson04_ScikitLearn02.ipynb diff --git a/lesson04_ScikitLearn01.ipynb b/lesson04_ScikitLearn01.ipynb new file mode 100644 index 0000000..bf53e7b --- /dev/null +++ b/lesson04_ScikitLearn01.ipynb @@ -0,0 +1,1309 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 19, + "id": "75fd2fd0", + "metadata": {}, + "outputs": [], + "source": [ + "# загрузка основных модулей\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.manifold import TSNE\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "\n", + 
"%config InlineBackend.figure_format = 'svg'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f251d2f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
..........................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.67
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.08
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.64
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.48
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.88
\n", + "

506 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 \n", + "502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 \n", + "503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 \n", + "504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 \n", + "505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 \n", + ".. ... ... ... \n", + "501 21.0 391.99 9.67 \n", + "502 21.0 396.90 9.08 \n", + "503 21.0 396.90 5.64 \n", + "504 21.0 393.45 6.48 \n", + "505 21.0 396.90 7.88 \n", + "\n", + "[506 rows x 13 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загрузка исходных данных\n", + "\n", + "from sklearn.datasets import load_boston\n", + "boston = load_boston()\n", + "\n", + "data = boston[\"data\"]\n", + "feature_names = boston[\"feature_names\"]\n", + "target = boston[\"target\"]\n", + "\n", + "# X матрица признаков\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "\n", + "\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "9f3d65fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
......
50122.4
50220.6
50323.9
50422.0
50511.9
\n", + "

506 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2\n", + ".. ...\n", + "501 22.4\n", + "502 20.6\n", + "503 23.9\n", + "504 22.0\n", + "505 11.9\n", + "\n", + "[506 rows x 1 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# y вектор результатов (цена недвижимости)\n", + "y = pd.DataFrame(target, columns=[\"price\"])\n", + "\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f9b127fb", + "metadata": {}, + "outputs": [], + "source": [ + "# разбиваем выборку на Обучающую и тестовую\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5a7d0735", + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "\n", + "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)\n", + "X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "f2d33b2c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "До:\t(404, 13)\n", + "После:\t(404, 2)\n" + ] + } + ], + "source": [ + "tsne = TSNE(n_components=2, learning_rate=250, random_state=42)\n", + "\n", + "X_train_tsne = tsne.fit_transform(X_train_scaled)\n", + "\n", + "print('До:\\t{}'.format(X_train_scaled.shape))\n", + "print('После:\\t{}'.format(X_train_tsne.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "145b1c10", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2021-08-30T23:21:29.663928\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.4.0, 
https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1])\n", + "\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lesson04_ScikitLearn02.ipynb b/lesson04_ScikitLearn02.ipynb new file mode 100644 index 0000000..6af567e --- /dev/null +++ b/lesson04_ScikitLearn02.ipynb @@ -0,0 +1,2289 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 9, + "id": "2cd41bf2", + "metadata": {}, + "outputs": [], + "source": [ + "# загрузка основных модулей\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.manifold import TSNE\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "from sklearn.cluster import KMeans\n", + "\n", + "plt.style.use('fivethirtyeight')\n", + "\n", + "%config InlineBackend.figure_format = 'svg'\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4e5351cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.98
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.14
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.03
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.94
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.33
..........................................
5010.062630.011.930.00.5736.59369.12.47861.0273.021.0391.999.67
5020.045270.011.930.00.5736.12076.72.28751.0273.021.0396.909.08
5030.060760.011.930.00.5736.97691.02.16751.0273.021.0396.905.64
5040.109590.011.930.00.5736.79489.32.38891.0273.021.0393.456.48
5050.047410.011.930.00.5736.03080.82.50501.0273.021.0396.907.88
\n", + "

506 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX \\\n", + "0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 \n", + "1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 \n", + "2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 \n", + "3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 \n", + "4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 \n", + ".. ... ... ... ... ... ... ... ... ... ... \n", + "501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 \n", + "502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 \n", + "503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 \n", + "504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 \n", + "505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 \n", + "\n", + " PTRATIO B LSTAT \n", + "0 15.3 396.90 4.98 \n", + "1 17.8 396.90 9.14 \n", + "2 17.8 392.83 4.03 \n", + "3 18.7 394.63 2.94 \n", + "4 18.7 396.90 5.33 \n", + ".. ... ... ... \n", + "501 21.0 391.99 9.67 \n", + "502 21.0 396.90 9.08 \n", + "503 21.0 396.90 5.64 \n", + "504 21.0 393.45 6.48 \n", + "505 21.0 396.90 7.88 \n", + "\n", + "[506 rows x 13 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загрузка исходных данных\n", + "\n", + "from sklearn.datasets import load_boston\n", + "boston = load_boston()\n", + "\n", + "data = boston[\"data\"]\n", + "feature_names = boston[\"feature_names\"]\n", + "target = boston[\"target\"]\n", + "\n", + "# X матрица признаков\n", + "X = pd.DataFrame(data, columns=feature_names)\n", + "\n", + "\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2b7e223b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
price
024.0
121.6
234.7
333.4
436.2
......
50122.4
50220.6
50323.9
50422.0
50511.9
\n", + "

506 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " price\n", + "0 24.0\n", + "1 21.6\n", + "2 34.7\n", + "3 33.4\n", + "4 36.2\n", + ".. ...\n", + "501 22.4\n", + "502 20.6\n", + "503 23.9\n", + "504 22.0\n", + "505 11.9\n", + "\n", + "[506 rows x 1 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# y вектор результатов (цена недвижимости)\n", + "y = pd.DataFrame(target, columns=[\"price\"])\n", + "\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8cf1b14e", + "metadata": {}, + "outputs": [], + "source": [ + "# разбиваем выборку на Обучающую и тестовую\n", + "\n", + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "955444aa", + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "\n", + "X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)\n", + "X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "59bb4639", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "До:\t(404, 13)\n", + "После:\t(404, 2)\n" + ] + } + ], + "source": [ + "tsne = TSNE(n_components=2, learning_rate=250, random_state=42)\n", + "\n", + "X_train_tsne = tsne.fit_transform(X_train_scaled)\n", + "\n", + "print('До:\\t{}'.format(X_train_scaled.shape))\n", + "print('После:\\t{}'.format(X_train_tsne.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1d63cb6e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "\r\n", + "\r\n", + "\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 2021-08-30T23:43:27.336088\r\n", + " image/svg+xml\r\n", + " \r\n", + " \r\n", + " Matplotlib v3.4.0, 
https://matplotlib.org/\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " 
\r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + 
" \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", 
+ " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + " \r\n", + "\r\n" + ], + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "kmeans = KMeans(n_clusters=3, random_state=42, max_iter=100)\n", + "\n", + "labels_train = kmeans.fit_predict(X_train_scaled)\n", + "\n", + "plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1], c=labels_train)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "24785f86", + "metadata": {}, + "outputs": [], + "source": [ + "labels_test = kmeans.predict(X_test_scaled)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "54cbca40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "price 22.796535\n", + "dtype: float64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# средняя по всему\n", + "y_train.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "321a2dba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "price 27.788372\n", + "dtype: float64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# средняя по кластеру 1\n", + "y_train[labels_train == 0].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "99c0b043", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "price 16.165354\n", + "dtype: float64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# средняя по кластеру 2\n", + "y_train[labels_train == 1].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1f1b9e8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "price 24.958115\n", + "dtype: float64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# средняя по кластеру 3\n", + "y_train[labels_train == 2].mean()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 22, + "id": "c0d0fac6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.07356558139534884" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# среднее CRIM по кластеру 1\n", + "X_train.loc[labels_train == 0, 'CRIM'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "8afda3f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10.797028425196851" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# среднее CRIM по кластеру 2\n", + "X_train.loc[labels_train == 1, 'CRIM'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "94f8e840", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.4216602094240837" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# среднее CRIM по кластеру 3\n", + "X_train.loc[labels_train == 2, 'CRIM'].mean()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From b979153fad6cdb556d17ef8c70cb1f6f093679a1 Mon Sep 17 00:00:00 2001 From: Artem Viznyuk Date: Mon, 13 Sep 2021 22:37:28 +0300 Subject: [PATCH 5/6] =?UTF-8?q?=D0=9A=D1=83=D1=80=D1=81=D0=BE=D0=B2=D0=BE?= =?UTF-8?q?=D0=B9=20=D0=BF=D1=80=D0=BE=D0=B5=D0=BA=D1=82=20=D0=BF=D0=B5?= =?UTF-8?q?=D1=80=D0=B2=D0=B0=D1=8F=20=D0=B2=D0=B5=D1=80=D1=81=D0=B8=D1=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RealEstatePricePredictionMoscow02.ipynb 
| 4276 +++++++++++++++++++++++ 1 file changed, 4276 insertions(+) create mode 100644 RealEstatePricePredictionMoscow02.ipynb diff --git a/RealEstatePricePredictionMoscow02.ipynb b/RealEstatePricePredictionMoscow02.ipynb new file mode 100644 index 0000000..c8fe35f --- /dev/null +++ b/RealEstatePricePredictionMoscow02.ipynb @@ -0,0 +1,4276 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6c895a6a", + "metadata": {}, + "outputs": [], + "source": [ + "# импорт используемых библиотек\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import random\n", + "\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import r2_score as r2\n", + "from sklearn.model_selection import KFold, GridSearchCV\n", + "\n", + "from datetime import datetime\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "%matplotlib inline\n", + "\n", + "# отключаем предупреждения\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "# устанавливаемый единый размер шрифта\n", + "matplotlib.rcParams.update({'font.size': 14})" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e639b2ff", + "metadata": {}, + "outputs": [], + "source": [ + "# функция для визуальной валидации результатов предсказания\n", + "\n", + "def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n", + " print(\"Train R2:\\t\" + str(round(r2(train_true_values, train_pred_values), 3)))\n", + " print(\"Test R2:\\t\" + str(round(r2(test_true_values, test_pred_values), 3)))\n", + " \n", + " plt.figure(figsize=(18,10))\n", + " \n", + " plt.subplot(121)\n", + " sns.scatterplot(x=train_pred_values, y=train_true_values)\n", + " plt.xlabel('Predicted values')\n", + " 
plt.ylabel('True values')\n", + " plt.title('Train sample prediction')\n", + " \n", + " plt.subplot(122)\n", + " sns.scatterplot(x=test_pred_values, y=test_true_values)\n", + " plt.xlabel('Predicted values')\n", + " plt.ylabel('True values')\n", + " plt.title('Test sample prediction')\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "45009ed5", + "metadata": {}, + "outputs": [], + "source": [ + "TRAIN_DATASET_PATH = 'C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/train.csv'\n", + "TEST_DATASET_PATH = 'C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "14b0f3fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Ecology_2Ecology_3Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Shops_2Price
99951260612.049.09072833.2726266.0312.019810.300323BB52103116NaN19B119367.455796
999616265272.064.30768437.0384209.0130.019770.072158BB26291NaN00A199715.148807
999727951781.029.64805716.5553635.035.019580.460556BB20438614NaN15B165953.912580
999814561211.032.33029222.3268705.039.019690.194489BB4780043125.035B171842.411855
99997202941.035.81547622.3013676.099.019750.127376BB4384293NaN39B177685.627486
\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "9995 1260 61 2.0 49.090728 33.272626 6.0 3 \n", + "9996 16265 27 2.0 64.307684 37.038420 9.0 13 \n", + "9997 2795 178 1.0 29.648057 16.555363 5.0 3 \n", + "9998 14561 21 1.0 32.330292 22.326870 5.0 3 \n", + "9999 7202 94 1.0 35.815476 22.301367 6.0 9 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 \\\n", + "9995 12.0 1981 0.300323 B B 52 \n", + "9996 0.0 1977 0.072158 B B 2 \n", + "9997 5.0 1958 0.460556 B B 20 \n", + "9998 9.0 1969 0.194489 B B 47 \n", + "9999 9.0 1975 0.127376 B B 43 \n", + "\n", + " Social_2 Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 \\\n", + "9995 10311 6 NaN 1 9 B \n", + "9996 629 1 NaN 0 0 A \n", + "9997 4386 14 NaN 1 5 B \n", + "9998 8004 3 125.0 3 5 B \n", + "9999 8429 3 NaN 3 9 B \n", + "\n", + " Price \n", + "9995 119367.455796 \n", + "9996 199715.148807 \n", + "9997 165953.912580 \n", + "9998 171842.411855 \n", + "9999 177685.627486 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загружаем обучающую выборку, проверяем данные\n", + "train_df = pd.read_csv(TRAIN_DATASET_PATH)\n", + "train_df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "18438e1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id int64\n", + "DistrictId int64\n", + "Rooms float64\n", + "Square float64\n", + "LifeSquare float64\n", + "KitchenSquare float64\n", + "Floor int64\n", + "HouseFloor float64\n", + "HouseYear int64\n", + "Ecology_1 float64\n", + "Ecology_2 object\n", + "Ecology_3 object\n", + "Social_1 int64\n", + "Social_2 int64\n", + "Social_3 int64\n", + "Healthcare_1 float64\n", + "Helthcare_2 int64\n", + "Shops_1 int64\n", + "Shops_2 object\n", + "Price float64\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# смотрим типы 
данных, цель уменьшить размер датасета\n", + "train_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e790eca4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Ecology_2Ecology_3Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Shops_2
499510379292.043.17752130.3399455.065.019620.069660BB3161194NaN12B
499616138383.093.69812294.52146510.02127.020180.060753BB1527872520.007B
499739121011.033.65672319.0032595.025.019660.038693BB28653311015.025B
49985722101.038.63515520.9762579.0814.019700.089040BB3379765NaN011B
499911004212.067.12274233.94434413.0917.020090.194489BB4780043125.035B
\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "4995 10379 29 2.0 43.177521 30.339945 5.0 6 \n", + "4996 16138 38 3.0 93.698122 94.521465 10.0 21 \n", + "4997 3912 101 1.0 33.656723 19.003259 5.0 2 \n", + "4998 5722 10 1.0 38.635155 20.976257 9.0 8 \n", + "4999 11004 21 2.0 67.122742 33.944344 13.0 9 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 \\\n", + "4995 5.0 1962 0.069660 B B 31 \n", + "4996 27.0 2018 0.060753 B B 15 \n", + "4997 5.0 1966 0.038693 B B 28 \n", + "4998 14.0 1970 0.089040 B B 33 \n", + "4999 17.0 2009 0.194489 B B 47 \n", + "\n", + " Social_2 Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 \n", + "4995 6119 4 NaN 1 2 B \n", + "4996 2787 2 520.0 0 7 B \n", + "4997 6533 1 1015.0 2 5 B \n", + "4998 7976 5 NaN 0 11 B \n", + "4999 8004 3 125.0 3 5 B " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загружаем проверочную выборку, проверяем данные\n", + "\n", + "test_df = pd.read_csv(TEST_DATASET_PATH)\n", + "test_df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ae193efd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Строк в трейне: 10000\n", + "Строк в тесте 5000\n" + ] + } + ], + "source": [ + "# размеры датасетов\n", + "\n", + "print('Строк в трейне:', train_df.shape[0])\n", + "print('Строк в тесте', test_df.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ec2ebfcf", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# проверяем размерности датасетов - обучающий имеет на столбец больше проверочного\n", + "train_df.shape[1] - 1 == test_df.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "17395fc9", +
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id object\n", + "DistrictId object\n", + "Rooms float64\n", + "Square float64\n", + "LifeSquare float64\n", + "KitchenSquare float64\n", + "Floor int16\n", + "HouseFloor int16\n", + "HouseYear int16\n", + "Ecology_1 float64\n", + "Ecology_2 object\n", + "Ecology_3 object\n", + "Social_1 int64\n", + "Social_2 int64\n", + "Social_3 int64\n", + "Healthcare_1 float64\n", + "Helthcare_2 int64\n", + "Shops_1 int64\n", + "Shops_2 object\n", + "Price float64\n", + "dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# приведение типов к строкам, чтобы модель не применяла <> для этих данных\n", + "train_df['Id'] = train_df['Id'].astype(str)\n", + "train_df['DistrictId'] = train_df['DistrictId'].astype(str)\n", + "\n", + "# уменьшаем размерности этажей,этажностей, возраста\n", + "train_df['Floor'] = train_df['Floor'].astype(np.int16)\n", + "train_df['HouseFloor'] = train_df['HouseFloor'].astype(np.int16)\n", + "train_df['HouseYear'] = train_df['HouseYear'].astype(np.int16)\n", + "\n", + "\n", + "# проверяем преобразование\n", + "train_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c6491ecb", + "metadata": {}, + "outputs": [], + "source": [ + "### Исследуем данные" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5ff7bb43", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# гистограмма распределения целевых значений\n", + "plt.figure(figsize = (16, 8))\n", + "\n", + "train_df['Price'].hist(bins=30)\n", + "plt.ylabel('Count')\n", + "plt.xlabel('Price')\n", + "\n", + "plt.title('Target distribution')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "07ea1861", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Price
count10000.00000010000.0000007887.00000010000.00000010000.00000010000.00000010000.00000010000.00000010000.00000010000.00000010000.0000005202.00000010000.00000010000.00000010000.000000
mean1.89050056.31577537.1996456.2733008.52670012.6094001984.7647000.11885824.6870005352.1574008.0392001142.9044601.3195004.231300214138.857399
std0.83951221.05873286.24120928.5609175.2411486.77597453.1127560.11902517.5326144006.79980323.8318751021.5172641.4936014.80634192872.293865
min0.0000001.1368590.3706190.0000001.0000000.000000-2005.0000000.0000000.000000168.0000000.0000000.0000000.0000000.00000059174.778028
25%1.00000041.77488122.7698321.0000004.0000009.0000001974.0000000.0176476.0000001564.0000000.000000350.0000000.0000001.000000153872.633942
50%2.00000052.51331032.7812606.0000007.00000013.0000001977.0000000.07542425.0000005285.0000002.000000900.0000001.0000003.000000192269.644879
75%2.00000065.90062545.1288039.00000012.00000017.0000002001.0000000.19578136.0000007227.0000005.0000001548.0000002.0000006.000000249135.462171
max19.000000641.0651937480.5921292014.00000042.000000117.0000004968.0000000.52186774.00000019083.000000141.0000004849.0000006.00000023.000000633233.466570
\n", + "
" + ], + "text/plain": [ + " Rooms Square LifeSquare KitchenSquare Floor \\\n", + "count 10000.000000 10000.000000 7887.000000 10000.000000 10000.000000 \n", + "mean 1.890500 56.315775 37.199645 6.273300 8.526700 \n", + "std 0.839512 21.058732 86.241209 28.560917 5.241148 \n", + "min 0.000000 1.136859 0.370619 0.000000 1.000000 \n", + "25% 1.000000 41.774881 22.769832 1.000000 4.000000 \n", + "50% 2.000000 52.513310 32.781260 6.000000 7.000000 \n", + "75% 2.000000 65.900625 45.128803 9.000000 12.000000 \n", + "max 19.000000 641.065193 7480.592129 2014.000000 42.000000 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Social_1 Social_2 \\\n", + "count 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 \n", + "mean 12.609400 1984.764700 0.118858 24.687000 5352.157400 \n", + "std 6.775974 53.112756 0.119025 17.532614 4006.799803 \n", + "min 0.000000 -2005.000000 0.000000 0.000000 168.000000 \n", + "25% 9.000000 1974.000000 0.017647 6.000000 1564.000000 \n", + "50% 13.000000 1977.000000 0.075424 25.000000 5285.000000 \n", + "75% 17.000000 2001.000000 0.195781 36.000000 7227.000000 \n", + "max 117.000000 4968.000000 0.521867 74.000000 19083.000000 \n", + "\n", + " Social_3 Healthcare_1 Helthcare_2 Shops_1 Price \n", + "count 10000.000000 5202.000000 10000.000000 10000.000000 10000.000000 \n", + "mean 8.039200 1142.904460 1.319500 4.231300 214138.857399 \n", + "std 23.831875 1021.517264 1.493601 4.806341 92872.293865 \n", + "min 0.000000 0.000000 0.000000 0.000000 59174.778028 \n", + "25% 0.000000 350.000000 0.000000 1.000000 153872.633942 \n", + "50% 2.000000 900.000000 1.000000 3.000000 192269.644879 \n", + "75% 5.000000 1548.000000 2.000000 6.000000 249135.462171 \n", + "max 141.000000 4849.000000 6.000000 23.000000 633233.466570 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# смотрим на параметры для данных выборки\n", + "train_df.describe()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "id": "983e0cfd", + "metadata": {}, + "outputs": [], + "source": [ + "# количество комнат \n", + "# 0 - допустимое значение для студии\n", + "# площадь" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8923291f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Id', 'DistrictId', 'Ecology_2', 'Ecology_3', 'Shops_2']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.select_dtypes(include='object').columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "30f9ec22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "27 851\n", + "1 652\n", + "23 565\n", + "6 511\n", + "9 294\n", + " ... \n", + "199 1\n", + "117 1\n", + "207 1\n", + "209 1\n", + "174 1\n", + "Name: DistrictId, Length: 205, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['DistrictId'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "81c3d0f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "B 9903\n", + "A 97\n", + "Name: Ecology_2, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Ecology_2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "80b59c3a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "B 9725\n", + "A 275\n", + "Name: Ecology_3, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Ecology_3'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bb41a69d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "B 9175\n", + "A 825\n", + "Name: Shops_2, dtype: int64" + ] + }, + 
"execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Shops_2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7761c352", + "metadata": {}, + "outputs": [], + "source": [ + "# Обработка выбросов\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "20aabf1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0 3880\n", + "1.0 3705\n", + "3.0 2235\n", + "4.0 150\n", + "5.0 18\n", + "0.0 8\n", + "10.0 2\n", + "19.0 1\n", + "6.0 1\n", + "Name: Rooms, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Rooms'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5acd823b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Ecology_2Ecology_3Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Shops_2Price
011809273.0115.027311NaN10.041020140.075424BB1130970NaN00B305018.871089
13013221.039.83252423.1692238.07819660.118537BB30620711183.010B177734.553407
2821513.078.34221547.67197210.021719880.025609BB3352610240.031B282078.720850
3235211.040.409907NaN1.0102219770.007122BB12640NaN01B168106.007630
413866942.064.28506738.5625179.0161619720.282798BB3386672NaN06B343995.102962
\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 NaN 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 NaN 1.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 Social_2 \\\n", + "0 10 2014 0.075424 B B 11 3097 \n", + "1 8 1966 0.118537 B B 30 6207 \n", + "2 17 1988 0.025609 B B 33 5261 \n", + "3 22 1977 0.007122 B B 1 264 \n", + "4 16 1972 0.282798 B B 33 8667 \n", + "\n", + " Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 Price \n", + "0 0 NaN 0 0 B 305018.871089 \n", + "1 1 1183.0 1 0 B 177734.553407 \n", + "2 0 240.0 3 1 B 282078.720850 \n", + "3 0 NaN 0 1 B 168106.007630 \n", + "4 2 NaN 0 6 B 343995.102962 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# пример, где считаем выбросами количество комнат равное 0 или >=6: создаем столбец Rooms_outlier, \n", + "# где указываем недоверие к числу комнат\n", + "\n", + "#train_df['Rooms_outlier'] = 0\n", + "#train_df.loc[(train_df['Rooms'] == 0) | (train_df['Rooms'] >= 6), 'Rooms_outlier'] = 1\n", + "#train_df.loc[train_df['Rooms'] == 0, 'Rooms'] = 1\n", + "#train_df.loc[train_df['Rooms'] >= 6, 'Rooms'] = train_df['Rooms'].median()\n", + "\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "8c897557", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0 3880\n", + "1.0 3705\n", + "3.0 2235\n", + "4.0 150\n", + "5.0 18\n", + "0.0 8\n", + "10.0 2\n", + "19.0 1\n", + "6.0 1\n", + "Name: Rooms, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Rooms'].value_counts()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "070297c7", + "metadata": { + 
"scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0 2460\n", + "8.0 1306\n", + "5.0 1169\n", + "10.0 1075\n", + "6.0 1038\n", + "9.0 843\n", + "0.0 697\n", + "7.0 609\n", + "12.0 249\n", + "11.0 233\n", + "13.0 67\n", + "14.0 51\n", + "4.0 39\n", + "15.0 31\n", + "3.0 22\n", + "16.0 16\n", + "20.0 14\n", + "17.0 12\n", + "19.0 11\n", + "18.0 6\n", + "2.0 4\n", + "22.0 3\n", + "30.0 2\n", + "43.0 2\n", + "41.0 2\n", + "112.0 2\n", + "25.0 2\n", + "51.0 2\n", + "37.0 2\n", + "58.0 2\n", + "32.0 2\n", + "21.0 1\n", + "73.0 1\n", + "75.0 1\n", + "36.0 1\n", + "27.0 1\n", + "63.0 1\n", + "1970.0 1\n", + "54.0 1\n", + "53.0 1\n", + "60.0 1\n", + "26.0 1\n", + "66.0 1\n", + "39.0 1\n", + "29.0 1\n", + "78.0 1\n", + "31.0 1\n", + "84.0 1\n", + "48.0 1\n", + "96.0 1\n", + "42.0 1\n", + "40.0 1\n", + "23.0 1\n", + "72.0 1\n", + "35.0 1\n", + "62.0 1\n", + "123.0 1\n", + "2014.0 1\n", + "Name: KitchenSquare, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['KitchenSquare'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d0c61fac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(13.0, 0.0)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['KitchenSquare'].quantile(.975), train_df['KitchenSquare'].quantile(.025)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e64fb693", + "metadata": {}, + "outputs": [], + "source": [ + "# обрабатываем выбросы в площадях кухни\n", + "condition = (train_df['KitchenSquare'].isna()) \\\n", + " | (train_df['KitchenSquare'] > train_df['KitchenSquare'].quantile(.975))\n", + " \n", + "train_df.loc[condition, 'KitchenSquare'] = train_df['KitchenSquare'].median()\n", + "\n", + "train_df.loc[train_df['KitchenSquare'] < 3, 'KitchenSquare'] = 3" + ] + }, + { + "cell_type": "code", + 
"execution_count": 26, + "id": "4e632236", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0 3183\n", + "8.0 1306\n", + "6.0 1227\n", + "5.0 1169\n", + "10.0 1075\n", + "9.0 843\n", + "7.0 609\n", + "12.0 249\n", + "11.0 233\n", + "13.0 67\n", + "4.0 39\n", + "Name: KitchenSquare, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['KitchenSquare'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "38abe743", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n", + " 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,\n", + " 26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 44,\n", + " 45, 47, 48, 99, 117], dtype=int16)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# исследуем этажность зданий\n", + "\n", + "train_df['HouseFloor'].sort_values().unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "03135765", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 37, 42],\n", + " dtype=int16)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# исследуем этаж\n", + "\n", + "train_df['Floor'].sort_values().unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d2b2bb71", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1825" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# находим случаи, когда этаж больше этажности здания\n", + "(train_df['Floor'] > train_df['HouseFloor']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": 
"b5f769f4", + "metadata": {}, + "outputs": [], + "source": [ + "# добавляем признак достоверности этажа\n", + "train_df['HouseFloor_outlier'] = 0\n", + "train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1\n", + "train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor_outlier'] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "863234a8", + "metadata": {}, + "outputs": [], + "source": [ + "# некорректную этажность заменяем медианой\n", + "train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor'] = train_df['HouseFloor'].median()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "35b81479", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([ 17, 19, 21, 25, 26, 33, 34, 37, 39, 44,\n", + " ...\n", + " 9913, 9921, 9930, 9931, 9938, 9953, 9960, 9968, 9970, 9975],\n", + " dtype='int64', length=1610)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# смотрим на id проблемных записей по этажам\n", + "floor_outliers = train_df.loc[train_df['Floor'] > train_df['HouseFloor']].index\n", + "floor_outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "d8c40d18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceSquareMeter_price
count10000.00000010000.00000010000.000000
mean214138.85739956.3157753988.383777
std92872.29386521.0587323755.437224
min59174.7780281.136859208.293451
25%153872.63394241.7748812918.525328
50%192269.64487952.5133103910.052336
75%249135.46217165.9006254692.375993
max633233.466570641.065193230463.537966
\n", + "
" + ], + "text/plain": [ + " Price Square Meter_price\n", + "count 10000.000000 10000.000000 10000.000000\n", + "mean 214138.857399 56.315775 3988.383777\n", + "std 92872.293865 21.058732 3755.437224\n", + "min 59174.778028 1.136859 208.293451\n", + "25% 153872.633942 41.774881 2918.525328\n", + "50% 192269.644879 52.513310 3910.052336\n", + "75% 249135.462171 65.900625 4692.375993\n", + "max 633233.466570 641.065193 230463.537966" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Добавляем цену квадратного метра AV\n", + "train_df['Meter_price'] = 0\n", + "train_df.loc[train_df['Square'] > 0, 'Meter_price'] = train_df['Price'] / train_df['Square']\n", + "#train_df['Meter_price'].sort_values(ascending=False)\n", + "train_df[['Price', 'Square', 'Meter_price']].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9f6e136e", + "metadata": {}, + "outputs": [], + "source": [ + "# подстановка для значений этажа\n", + "train_df.loc[floor_outliers, 'Floor'] = train_df.loc[floor_outliers, 'HouseFloor']\\\n", + " .apply(lambda x: random.randint(1, x))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "37b4218f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(train_df['Floor'] > train_df['HouseFloor']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a83f6692", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8828 4968\n", + "6073 2020\n", + "5641 2020\n", + "2350 2020\n", + "9091 2020\n", + " ... 
\n", + "6250 1914\n", + "2066 1912\n", + "2795 1912\n", + "6794 1910\n", + "9163 -2005\n", + "Name: HouseYear, Length: 10000, dtype: int16" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# выводим года постройки\n", + "train_df['HouseYear'].sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b5f4456f", + "metadata": {}, + "outputs": [], + "source": [ + "# если год постройки еще не наступал, то меняем на текущий, можно менять на медиану\n", + "train_df.loc[train_df['HouseYear'] > 2021, 'HouseYear'] = 2021\n", + "# если дом слишком древний, до 1910 - это первый разумный минимум. Заменяем медианой\n", + "train_df.loc[train_df['HouseYear'] < 1910, 'HouseYear'] = train_df['HouseYear'].median()\n", + "# год можно также предсказывать по этажности, по району и т.д.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "9c753033", + "metadata": {}, + "outputs": [], + "source": [ + "# обработка пропусков" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "f310a637", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id 0\n", + "DistrictId 0\n", + "Rooms 0\n", + "Square 0\n", + "LifeSquare 2113\n", + "KitchenSquare 0\n", + "Floor 0\n", + "HouseFloor 0\n", + "HouseYear 0\n", + "Ecology_1 0\n", + "Ecology_2 0\n", + "Ecology_3 0\n", + "Social_1 0\n", + "Social_2 0\n", + "Social_3 0\n", + "Healthcare_1 4798\n", + "Helthcare_2 0\n", + "Shops_1 0\n", + "Shops_2 0\n", + "Price 0\n", + "HouseFloor_outlier 0\n", + "Meter_price 0\n", + "dtype: int64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d914ad63", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SquareLifeSquareKitchenSquare
0115.027311NaN10.0
139.83252423.1692238.0
278.34221547.67197210.0
340.409907NaN3.0
464.28506738.5625179.0
562.52846547.1038336.0
633.93875021.7729309.0
750.39181432.8932568.0
846.88789244.6281323.0
979.86747775.0851253.0
\n", + "
" + ], + "text/plain": [ + " Square LifeSquare KitchenSquare\n", + "0 115.027311 NaN 10.0\n", + "1 39.832524 23.169223 8.0\n", + "2 78.342215 47.671972 10.0\n", + "3 40.409907 NaN 3.0\n", + "4 64.285067 38.562517 9.0\n", + "5 62.528465 47.103833 6.0\n", + "6 33.938750 21.772930 9.0\n", + "7 50.391814 32.893256 8.0\n", + "8 46.887892 44.628132 3.0\n", + "9 79.867477 75.085125 3.0" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df[['Square', 'LifeSquare', 'KitchenSquare']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "b68c9268", + "metadata": {}, + "outputs": [], + "source": [ + "# если нет данных по жилой площади - добавляем признак и пытаемся расчитать\n", + "train_df['LifeSquare_nan'] = train_df['LifeSquare'].isna() * 1\n", + "\n", + "condition = (train_df['LifeSquare'].isna()) \\\n", + " & (~train_df['Square'].isna()) \\\n", + " & (~train_df['KitchenSquare'].isna())\n", + " \n", + "train_df.loc[condition, 'LifeSquare'] = train_df.loc[condition, 'Square'] \\\n", + " - train_df.loc[condition, 'KitchenSquare'] - 3" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e1c3b780", + "metadata": {}, + "outputs": [], + "source": [ + "#train_df[['Healthcare_1','Healthcare_2']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "383b8664", + "metadata": {}, + "outputs": [], + "source": [ + "train_df.drop('Healthcare_1', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a113fc25", + "metadata": {}, + "outputs": [], + "source": [ + "class DataPreprocessing:\n", + " \"\"\"Подготовка исходных данных\"\"\"\n", + "\n", + " def __init__(self):\n", + " \"\"\"Параметры класса\"\"\"\n", + " self.medians = None\n", + " self.kitchen_square_quantile = None\n", + " \n", + " def fit(self, X):\n", + " \"\"\"Сохранение статистик\"\"\" \n", + " # Расчет медиан\n", + " self.medians = X.median()\n", + " 
self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)\n", + " \n", + " def transform(self, X):\n", + " \"\"\"Трансформация данных\"\"\"\n", + "\n", + " # Rooms\n", + " X['Rooms_outlier'] = 0\n", + " X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1\n", + " \n", + " X.loc[X['Rooms'] == 0, 'Rooms'] = 1\n", + " X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']\n", + " \n", + " # KitchenSquare\n", + " condition = (X['KitchenSquare'].isna()) \\\n", + " | (X['KitchenSquare'] > self.kitchen_square_quantile)\n", + " \n", + " X.loc[condition, 'KitchenSquare'] = self.medians['KitchenSquare']\n", + "\n", + " X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3\n", + " \n", + " # HouseFloor, Floor\n", + " X['HouseFloor_outlier'] = 0\n", + " X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1\n", + " X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1\n", + " \n", + " X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']\n", + " \n", + " floor_outliers = X.loc[X['Floor'] > X['HouseFloor']].index\n", + " X.loc[floor_outliers, 'Floor'] = X.loc[floor_outliers, 'HouseFloor']\\\n", + " .apply(lambda x: random.randint(1, x))\n", + " \n", + " # HouseYear\n", + " current_year = datetime.now().year\n", + " \n", + " X['HouseYear_outlier'] = 0\n", + " \n", + " X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1 \n", + " X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year\n", + " \n", + " # AV\n", + " #==========================================================================\n", + " X.loc[X['HouseYear'] < 1910, 'HouseYear_outlier'] = 1 \n", + " X.loc[X['HouseYear'] < 1910, 'HouseYear'] = self.medians['HouseYear']\n", + " #==========================================================================\n", + " X.loc[X['Square'].isna(), 'Square'] = self.medians['Square'] \n", + " #==========================================================================\n", + " \n", + " \n", + " # Healthcare_1\n", + " if 
'Healthcare_1' in X.columns:\n", + " X.drop('Healthcare_1', axis=1, inplace=True)\n", + " \n", + " # LifeSquare\n", + " X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1\n", + " condition = (X['LifeSquare'].isna()) & \\\n", + " (~X['Square'].isna()) & \\\n", + " (~X['KitchenSquare'].isna())\n", + " \n", + " X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3\n", + " \n", + " \n", + " X.fillna(self.medians, inplace=True)\n", + " \n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ff95fb13", + "metadata": {}, + "outputs": [], + "source": [ + "# построение новых признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "11e1d608", + "metadata": {}, + "outputs": [], + "source": [ + "# заменяем A/B на 0/1\n", + "binary_to_numbers = {'A': 0, 'B': 1}\n", + "\n", + "train_df['Ecology_2'] = train_df['Ecology_2'].replace(binary_to_numbers)\n", + "train_df['Ecology_3'] = train_df['Ecology_3'].replace(binary_to_numbers)\n", + "train_df['Shops_2'] = train_df['Shops_2'].replace(binary_to_numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "8a00e237", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DistrictIdDistrictSize
027851
11652
223565
36511
49294
\n", + "
" + ], + "text/plain": [ + " DistrictId DistrictSize\n", + "0 27 851\n", + "1 1 652\n", + "2 23 565\n", + "3 6 511\n", + "4 9 294" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# добавляем размер района\n", + "district_size = train_df['DistrictId'].value_counts().reset_index()\\\n", + " .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})\n", + "\n", + "district_size.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "a479ebce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...Social_2Social_3Helthcare_2Shops_1Shops_2PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSize
011809273.0115.027311102.02731110.041020140.075424...30970001305018.87108902651.7082721851
13013221.039.83252423.1692238.07819660.118537...62071101177734.55340704462.045990078
2821513.078.34221547.67197210.021719880.025609...52610311282078.72085003600.5966980652
3235211.040.40990734.4099073.0102219770.007122...2640011168106.00763004160.0196701652
413866942.064.28506738.5625179.0161619720.282798...86672061343995.10296205351.088794089
\n", + "

5 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Social_2 Social_3 Helthcare_2 \\\n", + "0 10 2014 0.075424 ... 3097 0 0 \n", + "1 8 1966 0.118537 ... 6207 1 1 \n", + "2 17 1988 0.025609 ... 5261 0 3 \n", + "3 22 1977 0.007122 ... 264 0 0 \n", + "4 16 1972 0.282798 ... 8667 2 0 \n", + "\n", + " Shops_1 Shops_2 Price HouseFloor_outlier Meter_price \\\n", + "0 0 1 305018.871089 0 2651.708272 \n", + "1 0 1 177734.553407 0 4462.045990 \n", + "2 1 1 282078.720850 0 3600.596698 \n", + "3 1 1 168106.007630 0 4160.019670 \n", + "4 6 1 343995.102962 0 5351.088794 \n", + "\n", + " LifeSquare_nan DistrictSize \n", + "0 1 851 \n", + "1 0 78 \n", + "2 0 652 \n", + "3 1 652 \n", + "4 0 89 \n", + "\n", + "[5 rows x 23 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# добавляем призак в датасет\n", + "train_df = train_df.merge(district_size, on='DistrictId', how='left')\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "aebec7b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True 5069\n", + "False 4931\n", + "Name: DistrictSize, dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# большие районы\n", + "(train_df['DistrictSize'] > 100).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "bdc9ecf1", + "metadata": {}, + "outputs": [], + "source": [ + "train_df['IsDistrictLarge'] = (train_df['DistrictSize'] > 100).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "9bc15614", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DistrictIdRoomsMedPriceByDistrict
001.0142799.434052
102.0176210.812334
203.0300610.359425
311.0146735.671740
412.0202706.090239
\n", + "
" + ], + "text/plain": [ + " DistrictId Rooms MedPriceByDistrict\n", + "0 0 1.0 142799.434052\n", + "1 0 2.0 176210.812334\n", + "2 0 3.0 300610.359425\n", + "3 1 1.0 146735.671740\n", + "4 1 2.0 202706.090239" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# медианная стоимость по району\n", + "med_price_by_district = train_df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'median'})\\\n", + " .rename(columns={'Price':'MedPriceByDistrict'})\n", + "\n", + "med_price_by_district.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "b6129f8e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(675, 3)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "med_price_by_district.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "0d3d34ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...Helthcare_2Shops_1Shops_2PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSizeIsDistrictLargeMedPriceByDistrict
011809273.0115.027311102.02731110.041020140.075424...001305018.87108902651.70827218511213530.788443
13013221.039.83252423.1692238.07819660.118537...101177734.55340704462.0459900780146032.829834
2821513.078.34221547.67197210.021719880.025609...311282078.72085003600.59669806521244005.272380
3235211.040.40990734.4099073.0102219770.007122...011168106.00763004160.01967016521146735.671740
413866942.064.28506738.5625179.0161619720.282798...061343995.10296205351.0887940890201584.441255
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Helthcare_2 Shops_1 Shops_2 \\\n", + "0 10 2014 0.075424 ... 0 0 1 \n", + "1 8 1966 0.118537 ... 1 0 1 \n", + "2 17 1988 0.025609 ... 3 1 1 \n", + "3 22 1977 0.007122 ... 0 1 1 \n", + "4 16 1972 0.282798 ... 0 6 1 \n", + "\n", + " Price HouseFloor_outlier Meter_price LifeSquare_nan \\\n", + "0 305018.871089 0 2651.708272 1 \n", + "1 177734.553407 0 4462.045990 0 \n", + "2 282078.720850 0 3600.596698 0 \n", + "3 168106.007630 0 4160.019670 1 \n", + "4 343995.102962 0 5351.088794 0 \n", + "\n", + " DistrictSize IsDistrictLarge MedPriceByDistrict \n", + "0 851 1 213530.788443 \n", + "1 78 0 146032.829834 \n", + "2 652 1 244005.272380 \n", + "3 652 1 146735.671740 \n", + "4 89 0 201584.441255 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = train_df.merge(med_price_by_district, on=['DistrictId', 'Rooms'], how='left')\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "16892db7", + "metadata": {}, + "outputs": [], + "source": [ + "# категоризация\n", + "def floor_to_cat(X):\n", + "\n", + " X['floor_cat'] = 0\n", + "\n", + " X.loc[X['Floor'] <= 3, 'floor_cat'] = 1 \n", + " X.loc[(X['Floor'] > 3) & (X['Floor'] <= 5), 'floor_cat'] = 2\n", + " X.loc[(X['Floor'] > 5) & (X['Floor'] <= 9), 'floor_cat'] = 3\n", + " X.loc[(X['Floor'] > 9) & (X['Floor'] <= 15), 'floor_cat'] = 4\n", + " X.loc[X['Floor'] > 15, 'floor_cat'] = 5\n", + "\n", + " return X\n", + "\n", + "\n", + "def floor_to_cat_pandas(X):\n", + " bins = [0, 3, 5, 9, 15, 
X['Floor'].max()]\n", + " X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False)\n", + " \n", + " X['floor_cat'].fillna(-1, inplace=True)\n", + " return X\n", + "\n", + "\n", + "def year_to_cat(X):\n", + "\n", + " X['year_cat'] = 0\n", + "\n", + " X.loc[X['HouseYear'] <= 1941, 'year_cat'] = 1\n", + " X.loc[(X['HouseYear'] > 1941) & (X['HouseYear'] <= 1945), 'year_cat'] = 2\n", + " X.loc[(X['HouseYear'] > 1945) & (X['HouseYear'] <= 1980), 'year_cat'] = 3\n", + " X.loc[(X['HouseYear'] > 1980) & (X['HouseYear'] <= 2000), 'year_cat'] = 4\n", + " X.loc[(X['HouseYear'] > 2000) & (X['HouseYear'] <= 2010), 'year_cat'] = 5\n", + " X.loc[(X['HouseYear'] > 2010), 'year_cat'] = 6\n", + "\n", + " return X\n", + "\n", + "\n", + "def year_to_cat_pandas(X):\n", + " bins = [0, 1941, 1945, 1980, 2000, 2010, X['HouseYear'].max()]\n", + " X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False)\n", + " \n", + " X['year_cat'].fillna(-1, inplace=True)\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "95906706", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 0\n", + "3 3\n", + "4 4\n", + " ..\n", + "9995 0\n", + "9996 3\n", + "9997 0\n", + "9998 0\n", + "9999 2\n", + "Name: Floor, Length: 10000, dtype: int64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bins = [0, 3, 5, 9, 15, train_df['Floor'].max()]\n", + "pd.cut(train_df['Floor'], bins=bins, labels=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "e47e6ba7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 (3, 5]\n", + "1 (5, 9]\n", + "2 (0, 3]\n", + "3 (9, 15]\n", + "4 (15, 42]\n", + " ... 
\n", + "9995 (0, 3]\n", + "9996 (9, 15]\n", + "9997 (0, 3]\n", + "9998 (0, 3]\n", + "9999 (5, 9]\n", + "Name: Floor, Length: 10000, dtype: category\n", + "Categories (5, interval[int64, right]): [(0, 3] < (3, 5] < (5, 9] < (9, 15] < (15, 42]]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bins = [0, 3, 5, 9, 15, train_df['Floor'].max()]\n", + "pd.cut(train_df['Floor'], bins=bins)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "20ca77d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...Shops_2PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSizeIsDistrictLargeMedPriceByDistrictyear_catfloor_cat
011809273.0115.027311102.02731110.041020140.075424...1305018.87108902651.70827218511213530.78844362
13013221.039.83252423.1692238.07819660.118537...1177734.55340704462.0459900780146032.82983433
2821513.078.34221547.67197210.021719880.025609...1282078.72085003600.59669806521244005.27238041
3235211.040.40990734.4099073.0102219770.007122...1168106.00763004160.01967016521146735.67174034
413866942.064.28506738.5625179.0161619720.282798...1343995.10296205351.0887940890201584.44125535
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Shops_2 Price \\\n", + "0 10 2014 0.075424 ... 1 305018.871089 \n", + "1 8 1966 0.118537 ... 1 177734.553407 \n", + "2 17 1988 0.025609 ... 1 282078.720850 \n", + "3 22 1977 0.007122 ... 1 168106.007630 \n", + "4 16 1972 0.282798 ... 1 343995.102962 \n", + "\n", + " HouseFloor_outlier Meter_price LifeSquare_nan DistrictSize \\\n", + "0 0 2651.708272 1 851 \n", + "1 0 4462.045990 0 78 \n", + "2 0 3600.596698 0 652 \n", + "3 0 4160.019670 1 652 \n", + "4 0 5351.088794 0 89 \n", + "\n", + " IsDistrictLarge MedPriceByDistrict year_cat floor_cat \n", + "0 1 213530.788443 6 2 \n", + "1 0 146032.829834 3 3 \n", + "2 1 244005.272380 4 1 \n", + "3 1 146735.671740 3 4 \n", + "4 0 201584.441255 3 5 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = year_to_cat(train_df)\n", + "train_df = floor_to_cat(train_df)\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "a3fb003c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
year_catfloor_catMedPriceByFloorYear
011357700.724533
112324303.812481
213429825.896082
314274992.472366
421467230.539057
\n", + "
" + ], + "text/plain": [ + " year_cat floor_cat MedPriceByFloorYear\n", + "0 1 1 357700.724533\n", + "1 1 2 324303.812481\n", + "2 1 3 429825.896082\n", + "3 1 4 274992.472366\n", + "4 2 1 467230.539057" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#!!! медианная цена по году и этажу\n", + "med_price_by_floor_year = train_df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'}).\\\n", + " rename(columns={'Price':'MedPriceByFloorYear'})\n", + "med_price_by_floor_year.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "16adca39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSizeIsDistrictLargeMedPriceByDistrictyear_catfloor_catMedPriceByFloorYear
011809273.0115.027311102.02731110.041020140.075424...305018.87108902651.70827218511213530.78844362164803.070010
13013221.039.83252423.1692238.07819660.118537...177734.55340704462.0459900780146032.82983433192448.500518
2821513.078.34221547.67197210.021719880.025609...282078.72085003600.59669806521244005.27238041207856.713420
3235211.040.40990734.4099073.0102219770.007122...168106.00763004160.01967016521146735.67174034183469.921382
413866942.064.28506738.5625179.0161619720.282798...343995.10296205351.0887940890201584.44125535173955.556579
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Price HouseFloor_outlier \\\n", + "0 10 2014 0.075424 ... 305018.871089 0 \n", + "1 8 1966 0.118537 ... 177734.553407 0 \n", + "2 17 1988 0.025609 ... 282078.720850 0 \n", + "3 22 1977 0.007122 ... 168106.007630 0 \n", + "4 16 1972 0.282798 ... 343995.102962 0 \n", + "\n", + " Meter_price LifeSquare_nan DistrictSize IsDistrictLarge \\\n", + "0 2651.708272 1 851 1 \n", + "1 4462.045990 0 78 0 \n", + "2 3600.596698 0 652 1 \n", + "3 4160.019670 1 652 1 \n", + "4 5351.088794 0 89 0 \n", + "\n", + " MedPriceByDistrict year_cat floor_cat MedPriceByFloorYear \n", + "0 213530.788443 6 2 164803.070010 \n", + "1 146032.829834 3 3 192448.500518 \n", + "2 244005.272380 4 1 207856.713420 \n", + "3 146735.671740 3 4 183469.921382 \n", + "4 201584.441255 3 5 173955.556579 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = train_df.merge(med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "1fa55c19", + "metadata": {}, + "outputs": [], + "source": [ + "class FeatureGenetator():\n", + " \"\"\"Генерация новых фич\"\"\"\n", + " \n", + " def __init__(self):\n", + " self.DistrictId_counts = None\n", + " self.binary_to_numbers = None\n", + " self.med_price_by_district = None\n", + " self.med_price_by_floor_year = None\n", + " self.house_year_max = None\n", + " self.floor_max = None\n", + " self.district_size = None\n", + " self.meter_price = None #AV\n", + " \n", + " def fit(self, X, 
y=None):\n", + " \n", + " X = X.copy()\n", + " \n", + " # Binary features\n", + " self.binary_to_numbers = {'A': 0, 'B': 1}\n", + " \n", + " # DistrictID\n", + " self.district_size = X['DistrictId'].value_counts().reset_index() \\\n", + " .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})\n", + " \n", + " # Target encoding\n", + " ## District, Rooms\n", + " df = X.copy()\n", + " \n", + " if y is not None:\n", + " df['Price'] = y.values\n", + " \n", + " self.med_price_by_district = df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'median'})\\\n", + " .rename(columns={'Price':'MedPriceByDistrict'})\n", + " \n", + " self.med_price_by_district_median = self.med_price_by_district['MedPriceByDistrict'].median()\n", + " \n", + " ## floor, year\n", + " if y is not None:\n", + " self.floor_max = df['Floor'].max()\n", + " self.house_year_max = df['HouseYear'].max()\n", + " df['Price'] = y.values\n", + " df = self.floor_to_cat(df)\n", + " df = self.year_to_cat(df)\n", + " self.med_price_by_floor_year = df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'}).\\\n", + " rename(columns={'Price':'MedPriceByFloorYear'})\n", + " self.med_price_by_floor_year_median = self.med_price_by_floor_year['MedPriceByFloorYear'].median()\n", + " \n", + " if y is not None:\n", + " df['Price'] = y.values\n", + " self.meter_price = df['Price']/df['Square'] #AV \n", + " # self.meter_price = \n", + " \n", + "\n", + " \n", + " def transform(self, X):\n", + " \n", + " # Binary features\n", + " X['Ecology_2'] = X['Ecology_2'].map(self.binary_to_numbers) # self.binary_to_numbers = {'A': 0, 'B': 1}\n", + " X['Ecology_3'] = X['Ecology_3'].map(self.binary_to_numbers)\n", + " X['Shops_2'] = X['Shops_2'].map(self.binary_to_numbers)\n", + " \n", + " # DistrictId, IsDistrictLarge\n", + " X = X.merge(self.district_size, on='DistrictId', how='left')\n", + " \n", + " X['new_district'] = 0\n", + " X.loc[X['DistrictSize'].isna(), 'new_district'] = 1\n", + " 
\n", + " X['DistrictSize'].fillna(5, inplace=True)\n", + " \n", + " X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)\n", + " \n", + " # More categorical features\n", + " X = self.floor_to_cat(X) # + столбец floor_cat\n", + " X = self.year_to_cat(X) # + столбец year_cat\n", + " \n", + " # Target encoding\n", + " if self.med_price_by_district is not None:\n", + " X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')\n", + " X['MedPriceByDistrict'].fillna(self.med_price_by_district_median, inplace=True)\n", + " \n", + " if self.med_price_by_floor_year is not None:\n", + " X = X.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')\n", + " X['MedPriceByFloorYear'].fillna(self.med_price_by_floor_year_median, inplace=True)\n", + " \n", + " # AV\n", + " X['MeterPrice'] = self.meter_price\n", + " X['MeterPrice'].fillna(3910,inplace = True)\n", + " #X.loc[(X['Square'] > 0), 'MeterPrice'] = X['Price'] / X['Square'] \n", + " \n", + " return X\n", + " \n", + " def floor_to_cat(self, X):\n", + " bins = [0, 3, 5, 9, 15, self.floor_max]\n", + " X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False)\n", + "\n", + " X['floor_cat'].fillna(-1, inplace=True)\n", + " return X\n", + " \n", + " def year_to_cat(self, X):\n", + " bins = [0, 1941, 1945, 1980, 2000, 2010, self.house_year_max]\n", + " X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False)\n", + "\n", + " X['year_cat'].fillna(-1, inplace=True)\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "6e21b269", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Id',\n", + " 'DistrictId',\n", + " 'Rooms',\n", + " 'Square',\n", + " 'LifeSquare',\n", + " 'KitchenSquare',\n", + " 'Floor',\n", + " 'HouseFloor',\n", + " 'HouseYear',\n", + " 'Ecology_1',\n", + " 'Ecology_2',\n", + " 'Ecology_3',\n", + " 'Social_1',\n", + " 'Social_2',\n", + " 'Social_3',\n", + " 'Helthcare_2',\n", + " 'Shops_1',\n", + " 
'Shops_2',\n", + " 'Price',\n", + " 'HouseFloor_outlier',\n", + " 'Meter_price',\n", + " 'LifeSquare_nan',\n", + " 'DistrictSize',\n", + " 'IsDistrictLarge',\n", + " 'MedPriceByDistrict',\n", + " 'year_cat',\n", + " 'floor_cat',\n", + " 'MedPriceByFloorYear']" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# отбор признаков\n", + "train_df.columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b70cf62f", + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',\n", + " 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',\n", + " 'Helthcare_2', 'Shops_1', 'Shops_2']\n", + "\n", + "new_feature_names = ['Rooms_outlier', 'HouseFloor_outlier', 'HouseYear_outlier', 'LifeSquare_nan', 'DistrictSize',\n", + " 'new_district', 'IsDistrictLarge', 'MedPriceByDistrict', 'MedPriceByFloorYear', 'MeterPrice']\n", + "\n", + "target_name = 'Price'" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "6cdc1577", + "metadata": {}, + "outputs": [], + "source": [ + "# разбиение на train и test" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "31fa43e6", + "metadata": {}, + "outputs": [], + "source": [ + "train_df = pd.read_csv(TRAIN_DATASET_PATH)\n", + "test_df = pd.read_csv(TEST_DATASET_PATH)\n", + "\n", + "X = train_df.drop(columns=target_name)\n", + "y = train_df[target_name]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "0fac92ea", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=21)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "3209cde6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((6700, 22), (3300, 22), (5000, 22))" + ] + }, + "execution_count": 66, +
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor = DataPreprocessing()\n", + "preprocessor.fit(X_train)\n", + "\n", + "X_train = preprocessor.transform(X_train)\n", + "X_valid = preprocessor.transform(X_valid)\n", + "test_df = preprocessor.transform(test_df)\n", + "\n", + "X_train.shape, X_valid.shape, test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "f9474705", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((6700, 30), (3300, 30), (5000, 30))" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_gen = FeatureGenetator()\n", + "features_gen.fit(X_train, y_train)\n", + "\n", + "X_train = features_gen.transform(X_train)\n", + "X_valid = features_gen.transform(X_valid)\n", + "test_df = features_gen.transform(test_df)\n", + "\n", + "X_train.shape, X_valid.shape, test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "6fba65c6", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = X_train[feature_names + new_feature_names]\n", + "X_valid = X_valid[feature_names + new_feature_names]\n", + "test_df = test_df[feature_names + new_feature_names]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "a67cd879", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 0, 0)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "4c9c6e00", + "metadata": {}, + "outputs": [], + "source": [ + "# построение модели" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "48adceb1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor(random_state=21)" + ] + }, + "execution_count": 71, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf_model = RandomForestRegressor(random_state=21, criterion='mse')\n", + "rf_model.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "984202de", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train R2:\t0.966\n", + "Test R2:\t0.708\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + " # оценка модели\n", + "y_train_preds = rf_model.predict(X_train)\n", + "y_test_preds = rf_model.predict(X_valid)\n", + "\n", + "evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "f880df39", + "metadata": {}, + "outputs": [], + "source": [ + "# кросс валидация" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "0290cf24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.77280206, 0.72851638, 0.75368855])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_score = cross_val_score(rf_model, X_train, y_train, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))\n", + "cv_score" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "5d373cf1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7516689994037202" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_score.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "b5be4638", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_nameimportance
23MedPriceByDistrict6.486684e-01
1Square1.067982e-01
2LifeSquare3.114880e-02
5HouseFloor2.124666e-02
6HouseYear2.101132e-02
12Social_31.984151e-02
20DistrictSize1.976566e-02
25MeterPrice1.767325e-02
24MedPriceByFloorYear1.759345e-02
4Floor1.538490e-02
7Ecology_11.497777e-02
3KitchenSquare1.478966e-02
11Social_21.436299e-02
10Social_11.299079e-02
14Shops_17.863885e-03
13Helthcare_25.410690e-03
0Rooms5.144853e-03
17HouseFloor_outlier1.434990e-03
9Ecology_39.522110e-04
15Shops_29.272947e-04
19LifeSquare_nan8.568115e-04
22IsDistrictLarge7.734220e-04
8Ecology_23.324003e-04
16Rooms_outlier4.981087e-05
18HouseYear_outlier2.952486e-07
21new_district0.000000e+00
\n", + "
" + ], + "text/plain": [ + " feature_name importance\n", + "23 MedPriceByDistrict 6.486684e-01\n", + "1 Square 1.067982e-01\n", + "2 LifeSquare 3.114880e-02\n", + "5 HouseFloor 2.124666e-02\n", + "6 HouseYear 2.101132e-02\n", + "12 Social_3 1.984151e-02\n", + "20 DistrictSize 1.976566e-02\n", + "25 MeterPrice 1.767325e-02\n", + "24 MedPriceByFloorYear 1.759345e-02\n", + "4 Floor 1.538490e-02\n", + "7 Ecology_1 1.497777e-02\n", + "3 KitchenSquare 1.478966e-02\n", + "11 Social_2 1.436299e-02\n", + "10 Social_1 1.299079e-02\n", + "14 Shops_1 7.863885e-03\n", + "13 Helthcare_2 5.410690e-03\n", + "0 Rooms 5.144853e-03\n", + "17 HouseFloor_outlier 1.434990e-03\n", + "9 Ecology_3 9.522110e-04\n", + "15 Shops_2 9.272947e-04\n", + "19 LifeSquare_nan 8.568115e-04\n", + "22 IsDistrictLarge 7.734220e-04\n", + "8 Ecology_2 3.324003e-04\n", + "16 Rooms_outlier 4.981087e-05\n", + "18 HouseYear_outlier 2.952486e-07\n", + "21 new_district 0.000000e+00" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# веса признаков\n", + "feature_importances = pd.DataFrame(zip(X_train.columns, rf_model.feature_importances_), \n", + " columns=['feature_name', 'importance'])\n", + "\n", + "feature_importances.sort_values(by='importance', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "7d95cb34", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5000, 26)" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# выгрузка результатов\n", + "test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "90e14b5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPrice
04567200000.0
15925200000.0
2960200000.0
33848200000.0
4746200000.0
\n", + "
" + ], + "text/plain": [ + " Id Price\n", + "0 4567 200000.0\n", + "1 5925 200000.0\n", + "2 960 200000.0\n", + "3 3848 200000.0\n", + "4 746 200000.0" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "submit = pd.read_csv('C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/sample_submission.csv')\n", + "submit.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "2c01f0a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([166108.90509395, 127232.78534428, 141508.34570682, ...,\n", + " 148166.23205735, 189297.95637356, 250788.19751388])" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = rf_model.predict(test_df)\n", + "predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "6c9f1523", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPrice
04567166108.905094
15925127232.785344
2960141508.345707
33848170928.860105
4746175241.143352
\n", + "
" + ], + "text/plain": [ + " Id Price\n", + "0 4567 166108.905094\n", + "1 5925 127232.785344\n", + "2 960 141508.345707\n", + "3 3848 170928.860105\n", + "4 746 175241.143352" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "submit['Price'] = predictions\n", + "submit.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "e6052d38", + "metadata": {}, + "outputs": [], + "source": [ + "submit.to_csv('C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/rf_submit.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 1d23a6f37141d7b9d8975c6d265844137e6e7f94 Mon Sep 17 00:00:00 2001 From: Artem Viznyuk Date: Tue, 14 Sep 2021 11:14:40 +0300 Subject: [PATCH 6/6] =?UTF-8?q?=D0=9A=D1=83=D1=80=D1=81=D0=BE=D0=B2=D0=BE?= =?UTF-8?q?=D0=B9=20=D0=BF=D1=80=D0=BE=D0=B5=D0=BA=D1=82=20=D0=B2=D1=82?= =?UTF-8?q?=D0=BE=D1=80=D0=B0=D1=8F=20=D0=B2=D0=B5=D1=80=D1=81=D0=B8=D1=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- RealEstatePricePredictionMoscow03.ipynb | 4304 +++++++++++++++++++++++ 1 file changed, 4304 insertions(+) create mode 100644 RealEstatePricePredictionMoscow03.ipynb diff --git a/RealEstatePricePredictionMoscow03.ipynb b/RealEstatePricePredictionMoscow03.ipynb new file mode 100644 index 0000000..3056866 --- /dev/null +++ b/RealEstatePricePredictionMoscow03.ipynb @@ -0,0 +1,4304 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6c895a6a", + "metadata": {}, + "outputs": [], + "source": [ + "# импорт используемых 
библиотек\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import random\n", + "\n", + "from sklearn.model_selection import train_test_split, cross_val_score\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.metrics import r2_score as r2\n", + "from sklearn.model_selection import KFold, GridSearchCV\n", + "\n", + "from datetime import datetime\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "%matplotlib inline\n", + "\n", + "# отключаем предупреждения\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "\n", + "# устанавливаем единый размер шрифта\n", + "matplotlib.rcParams.update({'font.size': 14})" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e639b2ff", + "metadata": {}, + "outputs": [], + "source": [ + "# функция для визуальной валидации результатов предсказания\n", + "\n", + "def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):\n", + " print(\"Train R2:\\t\" + str(round(r2(train_true_values, train_pred_values), 3)))\n", + " print(\"Test R2:\\t\" + str(round(r2(test_true_values, test_pred_values), 3)))\n", + " \n", + " plt.figure(figsize=(18,10))\n", + " \n", + " plt.subplot(121)\n", + " sns.scatterplot(x=train_pred_values, y=train_true_values)\n", + " plt.xlabel('Predicted values')\n", + " plt.ylabel('True values')\n", + " plt.title('Train sample prediction')\n", + " \n", + " plt.subplot(122)\n", + " sns.scatterplot(x=test_pred_values, y=test_true_values)\n", + " plt.xlabel('Predicted values')\n", + " plt.ylabel('True values')\n", + " plt.title('Test sample prediction')\n", + "\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "45009ed5", + "metadata": {}, + "outputs": [], + "source": [ + "TRAIN_DATASET_PATH = 
'C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/train.csv'\n", + "TEST_DATASET_PATH = 'C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "14b0f3fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Ecology_2Ecology_3Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Shops_2Price
99951260612.049.09072833.2726266.0312.019810.300323BB52103116NaN19B119367.455796
999616265272.064.30768437.0384209.0130.019770.072158BB26291NaN00A199715.148807
999727951781.029.64805716.5553635.035.019580.460556BB20438614NaN15B165953.912580
999814561211.032.33029222.3268705.039.019690.194489BB4780043125.035B171842.411855
99997202941.035.81547622.3013676.099.019750.127376BB4384293NaN39B177685.627486
\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "9995 1260 61 2.0 49.090728 33.272626 6.0 3 \n", + "9996 16265 27 2.0 64.307684 37.038420 9.0 13 \n", + "9997 2795 178 1.0 29.648057 16.555363 5.0 3 \n", + "9998 14561 21 1.0 32.330292 22.326870 5.0 3 \n", + "9999 7202 94 1.0 35.815476 22.301367 6.0 9 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 \\\n", + "9995 12.0 1981 0.300323 B B 52 \n", + "9996 0.0 1977 0.072158 B B 2 \n", + "9997 5.0 1958 0.460556 B B 20 \n", + "9998 9.0 1969 0.194489 B B 47 \n", + "9999 9.0 1975 0.127376 B B 43 \n", + "\n", + " Social_2 Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 \\\n", + "9995 10311 6 NaN 1 9 B \n", + "9996 629 1 NaN 0 0 A \n", + "9997 4386 14 NaN 1 5 B \n", + "9998 8004 3 125.0 3 5 B \n", + "9999 8429 3 NaN 3 9 B \n", + "\n", + " Price \n", + "9995 119367.455796 \n", + "9996 199715.148807 \n", + "9997 165953.912580 \n", + "9998 171842.411855 \n", + "9999 177685.627486 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загружаем обучающую выборку, проверяем данные\n", + "train_df = pd.read_csv(TRAIN_DATASET_PATH)\n", + "train_df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "18438e1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id int64\n", + "DistrictId int64\n", + "Rooms float64\n", + "Square float64\n", + "LifeSquare float64\n", + "KitchenSquare float64\n", + "Floor int64\n", + "HouseFloor float64\n", + "HouseYear int64\n", + "Ecology_1 float64\n", + "Ecology_2 object\n", + "Ecology_3 object\n", + "Social_1 int64\n", + "Social_2 int64\n", + "Social_3 int64\n", + "Healthcare_1 float64\n", + "Helthcare_2 int64\n", + "Shops_1 int64\n", + "Shops_2 object\n", + "Price float64\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# смотрим типы 
данных, цель уменьшить размер датасета\n", + "train_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e790eca4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Ecology_2Ecology_3Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Shops_2
499510379292.043.17752130.3399455.065.019620.069660BB3161194NaN12B
499616138383.093.69812294.52146510.02127.020180.060753BB1527872520.007B
499739121011.033.65672319.0032595.025.019660.038693BB28653311015.025B
49985722101.038.63515520.9762579.0814.019700.089040BB3379765NaN011B
499911004212.067.12274233.94434413.0917.020090.194489BB4780043125.035B
\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "4995 10379 29 2.0 43.177521 30.339945 5.0 6 \n", + "4996 16138 38 3.0 93.698122 94.521465 10.0 21 \n", + "4997 3912 101 1.0 33.656723 19.003259 5.0 2 \n", + "4998 5722 10 1.0 38.635155 20.976257 9.0 8 \n", + "4999 11004 21 2.0 67.122742 33.944344 13.0 9 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 \\\n", + "4995 5.0 1962 0.069660 B B 31 \n", + "4996 27.0 2018 0.060753 B B 15 \n", + "4997 5.0 1966 0.038693 B B 28 \n", + "4998 14.0 1970 0.089040 B B 33 \n", + "4999 17.0 2009 0.194489 B B 47 \n", + "\n", + " Social_2 Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 \n", + "4995 6119 4 NaN 1 2 B \n", + "4996 2787 2 520.0 0 7 B \n", + "4997 6533 1 1015.0 2 5 B \n", + "4998 7976 5 NaN 0 11 B \n", + "4999 8004 3 125.0 3 5 B " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# загружаем проверочную выборку, проверяем данные\n", + "\n", + "test_df = pd.read_csv(TEST_DATASET_PATH)\n", + "test_df.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ae193efd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Строк в трейне: 10000\n", + "Строк в тесте 5000\n" + ] + } + ], + "source": [ + "# размеры датасетов\n", + "\n", + "print('Строк в трейне:', train_df.shape[0])\n", + "print('Строк в тесте', test_df.shape[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ec2ebfcf", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# проверяем размерности датасетов - обучающий имеет на столбец больше проверочного\n", + "train_df.shape[1] - 1 == test_df.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "17395fc9", +
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id object\n", + "DistrictId object\n", + "Rooms float64\n", + "Square float64\n", + "LifeSquare float64\n", + "KitchenSquare float64\n", + "Floor int16\n", + "HouseFloor int16\n", + "HouseYear int16\n", + "Ecology_1 float64\n", + "Ecology_2 object\n", + "Ecology_3 object\n", + "Social_1 int64\n", + "Social_2 int64\n", + "Social_3 int64\n", + "Healthcare_1 float64\n", + "Helthcare_2 int64\n", + "Shops_1 int64\n", + "Shops_2 object\n", + "Price float64\n", + "dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# приведение типов к строкам, чтобы модель не применяла <> для этих данных\n", + "train_df['Id'] = train_df['Id'].astype(str)\n", + "train_df['DistrictId'] = train_df['DistrictId'].astype(str)\n", + "\n", + "# уменьшаем размерности этажей,этажностей, возраста\n", + "train_df['Floor'] = train_df['Floor'].astype(np.int16)\n", + "train_df['HouseFloor'] = train_df['HouseFloor'].astype(np.int16)\n", + "train_df['HouseYear'] = train_df['HouseYear'].astype(np.int16)\n", + "\n", + "\n", + "# проверяем преобразование\n", + "train_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c6491ecb", + "metadata": {}, + "outputs": [], + "source": [ + "### Исследуем данные" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5ff7bb43", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# гистограма распределения целевых значений\n", + "plt.figure(figsize = (16, 8))\n", + "\n", + "train_df['Price'].hist(bins=30)\n", + "plt.ylabel('Count')\n", + "plt.xlabel('Price')\n", + "\n", + "plt.title('Target distribution')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "07ea1861", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Price
count10000.00000010000.0000007887.00000010000.00000010000.00000010000.00000010000.00000010000.00000010000.00000010000.00000010000.0000005202.00000010000.00000010000.00000010000.000000
mean1.89050056.31577537.1996456.2733008.52670012.6094001984.7647000.11885824.6870005352.1574008.0392001142.9044601.3195004.231300214138.857399
std0.83951221.05873286.24120928.5609175.2411486.77597453.1127560.11902517.5326144006.79980323.8318751021.5172641.4936014.80634192872.293865
min0.0000001.1368590.3706190.0000001.0000000.000000-2005.0000000.0000000.000000168.0000000.0000000.0000000.0000000.00000059174.778028
25%1.00000041.77488122.7698321.0000004.0000009.0000001974.0000000.0176476.0000001564.0000000.000000350.0000000.0000001.000000153872.633942
50%2.00000052.51331032.7812606.0000007.00000013.0000001977.0000000.07542425.0000005285.0000002.000000900.0000001.0000003.000000192269.644879
75%2.00000065.90062545.1288039.00000012.00000017.0000002001.0000000.19578136.0000007227.0000005.0000001548.0000002.0000006.000000249135.462171
max19.000000641.0651937480.5921292014.00000042.000000117.0000004968.0000000.52186774.00000019083.000000141.0000004849.0000006.00000023.000000633233.466570
\n", + "
" + ], + "text/plain": [ + " Rooms Square LifeSquare KitchenSquare Floor \\\n", + "count 10000.000000 10000.000000 7887.000000 10000.000000 10000.000000 \n", + "mean 1.890500 56.315775 37.199645 6.273300 8.526700 \n", + "std 0.839512 21.058732 86.241209 28.560917 5.241148 \n", + "min 0.000000 1.136859 0.370619 0.000000 1.000000 \n", + "25% 1.000000 41.774881 22.769832 1.000000 4.000000 \n", + "50% 2.000000 52.513310 32.781260 6.000000 7.000000 \n", + "75% 2.000000 65.900625 45.128803 9.000000 12.000000 \n", + "max 19.000000 641.065193 7480.592129 2014.000000 42.000000 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Social_1 Social_2 \\\n", + "count 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 \n", + "mean 12.609400 1984.764700 0.118858 24.687000 5352.157400 \n", + "std 6.775974 53.112756 0.119025 17.532614 4006.799803 \n", + "min 0.000000 -2005.000000 0.000000 0.000000 168.000000 \n", + "25% 9.000000 1974.000000 0.017647 6.000000 1564.000000 \n", + "50% 13.000000 1977.000000 0.075424 25.000000 5285.000000 \n", + "75% 17.000000 2001.000000 0.195781 36.000000 7227.000000 \n", + "max 117.000000 4968.000000 0.521867 74.000000 19083.000000 \n", + "\n", + " Social_3 Healthcare_1 Helthcare_2 Shops_1 Price \n", + "count 10000.000000 5202.000000 10000.000000 10000.000000 10000.000000 \n", + "mean 8.039200 1142.904460 1.319500 4.231300 214138.857399 \n", + "std 23.831875 1021.517264 1.493601 4.806341 92872.293865 \n", + "min 0.000000 0.000000 0.000000 0.000000 59174.778028 \n", + "25% 0.000000 350.000000 0.000000 1.000000 153872.633942 \n", + "50% 2.000000 900.000000 1.000000 3.000000 192269.644879 \n", + "75% 5.000000 1548.000000 2.000000 6.000000 249135.462171 \n", + "max 141.000000 4849.000000 6.000000 23.000000 633233.466570 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# смотрим на параметры для данных выборки\n", + "train_df.describe()" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "id": "983e0cfd", + "metadata": {}, + "outputs": [], + "source": [ + "# количество комнат \n", + "# 0 - допустимое значение для студии\n", + "# площадь" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8923291f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Id', 'DistrictId', 'Ecology_2', 'Ecology_3', 'Shops_2']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.select_dtypes(include='object').columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "30f9ec22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "27 851\n", + "1 652\n", + "23 565\n", + "6 511\n", + "9 294\n", + " ... \n", + "199 1\n", + "117 1\n", + "207 1\n", + "209 1\n", + "174 1\n", + "Name: DistrictId, Length: 205, dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['DistrictId'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "81c3d0f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "B 9903\n", + "A 97\n", + "Name: Ecology_2, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Ecology_2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "80b59c3a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "B 9725\n", + "A 275\n", + "Name: Ecology_3, dtype: int64" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Ecology_3'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "bb41a69d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "B 9175\n", + "A 825\n", + "Name: Shops_2, dtype: int64" + ] + }, + 
"execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Shops_2'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "7761c352", + "metadata": {}, + "outputs": [], + "source": [ + "# Обработка выбросов\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "20aabf1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0 3880\n", + "1.0 3705\n", + "3.0 2235\n", + "4.0 150\n", + "5.0 18\n", + "0.0 8\n", + "10.0 2\n", + "19.0 1\n", + "6.0 1\n", + "Name: Rooms, dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Rooms'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5acd823b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1Ecology_2Ecology_3Social_1Social_2Social_3Healthcare_1Helthcare_2Shops_1Shops_2Price
011809273.0115.027311NaN10.041020140.075424BB1130970NaN00B305018.871089
13013221.039.83252423.1692238.07819660.118537BB30620711183.010B177734.553407
2821513.078.34221547.67197210.021719880.025609BB3352610240.031B282078.720850
3235211.040.409907NaN1.0102219770.007122BB12640NaN01B168106.007630
413866942.064.28506738.5625179.0161619720.282798BB3386672NaN06B343995.102962
\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 NaN 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 NaN 1.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 Ecology_2 Ecology_3 Social_1 Social_2 \\\n", + "0 10 2014 0.075424 B B 11 3097 \n", + "1 8 1966 0.118537 B B 30 6207 \n", + "2 17 1988 0.025609 B B 33 5261 \n", + "3 22 1977 0.007122 B B 1 264 \n", + "4 16 1972 0.282798 B B 33 8667 \n", + "\n", + " Social_3 Healthcare_1 Helthcare_2 Shops_1 Shops_2 Price \n", + "0 0 NaN 0 0 B 305018.871089 \n", + "1 1 1183.0 1 0 B 177734.553407 \n", + "2 0 240.0 3 1 B 282078.720850 \n", + "3 0 NaN 0 1 B 168106.007630 \n", + "4 2 NaN 0 6 B 343995.102962 " + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# пример, где считаем выбросами количество комнат равное 0 или >=6: создаем столбец Rooms_outlier, \n", + "# где указываем недоверие к числу комнат\n", + "\n", + "#train_df['Rooms_outlier'] = 0\n", + "#train_df.loc[(train_df['Rooms'] == 0) | (train_df['Rooms'] >= 6), 'Rooms_outlier'] = 1\n", + "#train_df.loc[train_df['Rooms'] == 0, 'Rooms'] = 1\n", + "#train_df.loc[train_df['Rooms'] >= 6, 'Rooms'] = train_df['Rooms'].median()\n", + "\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "8c897557", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2.0 3880\n", + "1.0 3705\n", + "3.0 2235\n", + "4.0 150\n", + "5.0 18\n", + "0.0 8\n", + "10.0 2\n", + "19.0 1\n", + "6.0 1\n", + "Name: Rooms, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['Rooms'].value_counts()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "070297c7", + "metadata": { + 
"scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0 2460\n", + "8.0 1306\n", + "5.0 1169\n", + "10.0 1075\n", + "6.0 1038\n", + "9.0 843\n", + "0.0 697\n", + "7.0 609\n", + "12.0 249\n", + "11.0 233\n", + "13.0 67\n", + "14.0 51\n", + "4.0 39\n", + "15.0 31\n", + "3.0 22\n", + "16.0 16\n", + "20.0 14\n", + "17.0 12\n", + "19.0 11\n", + "18.0 6\n", + "2.0 4\n", + "22.0 3\n", + "30.0 2\n", + "43.0 2\n", + "41.0 2\n", + "112.0 2\n", + "25.0 2\n", + "51.0 2\n", + "37.0 2\n", + "58.0 2\n", + "32.0 2\n", + "21.0 1\n", + "73.0 1\n", + "75.0 1\n", + "36.0 1\n", + "27.0 1\n", + "63.0 1\n", + "1970.0 1\n", + "54.0 1\n", + "53.0 1\n", + "60.0 1\n", + "26.0 1\n", + "66.0 1\n", + "39.0 1\n", + "29.0 1\n", + "78.0 1\n", + "31.0 1\n", + "84.0 1\n", + "48.0 1\n", + "96.0 1\n", + "42.0 1\n", + "40.0 1\n", + "23.0 1\n", + "72.0 1\n", + "35.0 1\n", + "62.0 1\n", + "123.0 1\n", + "2014.0 1\n", + "Name: KitchenSquare, dtype: int64" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['KitchenSquare'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d0c61fac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(13.0, 0.0)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['KitchenSquare'].quantile(.975), train_df['KitchenSquare'].quantile(.025)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e64fb693", + "metadata": {}, + "outputs": [], + "source": [ + "# обрабатываем выбросы в площадях кухни\n", + "condition = (train_df['KitchenSquare'].isna()) \\\n", + " | (train_df['KitchenSquare'] > train_df['KitchenSquare'].quantile(.975))\n", + " \n", + "train_df.loc[condition, 'KitchenSquare'] = train_df['KitchenSquare'].median()\n", + "\n", + "train_df.loc[train_df['KitchenSquare'] < 3, 'KitchenSquare'] = 3" + ] + }, + { + "cell_type": "code", + 
"execution_count": 26, + "id": "4e632236", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3.0 3183\n", + "8.0 1306\n", + "6.0 1227\n", + "5.0 1169\n", + "10.0 1075\n", + "9.0 843\n", + "7.0 609\n", + "12.0 249\n", + "11.0 233\n", + "13.0 67\n", + "4.0 39\n", + "Name: KitchenSquare, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df['KitchenSquare'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "38abe743", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n", + " 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,\n", + " 26, 27, 28, 29, 30, 31, 32, 36, 37, 38, 39, 40, 44,\n", + " 45, 47, 48, 99, 117], dtype=int16)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# исследуем этажность зданий\n", + "\n", + "train_df['HouseFloor'].sort_values().unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "03135765", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 37, 42],\n", + " dtype=int16)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# исследуем этаж\n", + "\n", + "train_df['Floor'].sort_values().unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d2b2bb71", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1825" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# находим случаи, когда этаж больше этажности здания\n", + "(train_df['Floor'] > train_df['HouseFloor']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": 
"b5f769f4", + "metadata": {}, + "outputs": [], + "source": [ + "# добавляем признак достоверности этажа\n", + "train_df['HouseFloor_outlier'] = 0\n", + "train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1\n", + "train_df.loc[train_df['Floor'] > train_df['HouseFloor'], 'HouseFloor_outlier'] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "863234a8", + "metadata": {}, + "outputs": [], + "source": [ + "# некорректную этажность заменяем медианой\n", + "train_df.loc[train_df['HouseFloor'] == 0, 'HouseFloor'] = train_df['HouseFloor'].median()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "35b81479", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Int64Index([ 17, 19, 21, 25, 26, 33, 34, 37, 39, 44,\n", + " ...\n", + " 9913, 9921, 9930, 9931, 9938, 9953, 9960, 9968, 9970, 9975],\n", + " dtype='int64', length=1610)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# смотрим на id проблемных записей по этажам\n", + "floor_outliers = train_df.loc[train_df['Floor'] > train_df['HouseFloor']].index\n", + "floor_outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "e779720c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PriceSquareMeter_price
count10000.00000010000.00000010000.000000
mean214138.85739956.3157753988.383777
std92872.29386521.0587323755.437224
min59174.7780281.136859208.293451
25%153872.63394241.7748812918.525328
50%192269.64487952.5133103910.052336
75%249135.46217165.9006254692.375993
max633233.466570641.065193230463.537966
\n", + "
" + ], + "text/plain": [ + " Price Square Meter_price\n", + "count 10000.000000 10000.000000 10000.000000\n", + "mean 214138.857399 56.315775 3988.383777\n", + "std 92872.293865 21.058732 3755.437224\n", + "min 59174.778028 1.136859 208.293451\n", + "25% 153872.633942 41.774881 2918.525328\n", + "50% 192269.644879 52.513310 3910.052336\n", + "75% 249135.462171 65.900625 4692.375993\n", + "max 633233.466570 641.065193 230463.537966" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Добавляем цену квадратного метра AV\n", + "train_df['Meter_price'] = 0\n", + "train_df.loc[train_df['Square'] > 0, 'Meter_price'] = train_df['Price'] / train_df['Square']\n", + "#train_df['Meter_price'].sort_values(ascending=False)\n", + "train_df[['Price', 'Square', 'Meter_price']].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9f6e136e", + "metadata": {}, + "outputs": [], + "source": [ + "# подстановка для значений этажа\n", + "train_df.loc[floor_outliers, 'Floor'] = train_df.loc[floor_outliers, 'HouseFloor']\\\n", + " .apply(lambda x: random.randint(1, x))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "37b4218f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(train_df['Floor'] > train_df['HouseFloor']).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a83f6692", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8828 4968\n", + "6073 2020\n", + "5641 2020\n", + "2350 2020\n", + "9091 2020\n", + " ... 
\n", + "6250 1914\n", + "2066 1912\n", + "2795 1912\n", + "6794 1910\n", + "9163 -2005\n", + "Name: HouseYear, Length: 10000, dtype: int16" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# выводим года постройки\n", + "train_df['HouseYear'].sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "b5f4456f", + "metadata": {}, + "outputs": [], + "source": [ + "# если год постройки еще не наступал, то меняем на текущий, можно менять на медиану\n", + "train_df.loc[train_df['HouseYear'] > 2021, 'HouseYear'] = 2021\n", + "# если дом слишком древний, до 1910 - это первый разумный минимум. Заменяем медианой\n", + "train_df.loc[train_df['HouseYear'] < 1910, 'HouseYear'] = train_df['HouseYear'].median()\n", + "# год можно также предсказывать по этажности, по району и т.д.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "9c753033", + "metadata": {}, + "outputs": [], + "source": [ + "# обработка пропусков" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "f310a637", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Id 0\n", + "DistrictId 0\n", + "Rooms 0\n", + "Square 0\n", + "LifeSquare 2113\n", + "KitchenSquare 0\n", + "Floor 0\n", + "HouseFloor 0\n", + "HouseYear 0\n", + "Ecology_1 0\n", + "Ecology_2 0\n", + "Ecology_3 0\n", + "Social_1 0\n", + "Social_2 0\n", + "Social_3 0\n", + "Healthcare_1 4798\n", + "Helthcare_2 0\n", + "Shops_1 0\n", + "Shops_2 0\n", + "Price 0\n", + "HouseFloor_outlier 0\n", + "Meter_price 0\n", + "dtype: int64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "d914ad63", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SquareLifeSquareKitchenSquare
0115.027311NaN10.0
139.83252423.1692238.0
278.34221547.67197210.0
340.409907NaN3.0
464.28506738.5625179.0
562.52846547.1038336.0
633.93875021.7729309.0
750.39181432.8932568.0
846.88789244.6281323.0
979.86747775.0851253.0
\n", + "
" + ], + "text/plain": [ + " Square LifeSquare KitchenSquare\n", + "0 115.027311 NaN 10.0\n", + "1 39.832524 23.169223 8.0\n", + "2 78.342215 47.671972 10.0\n", + "3 40.409907 NaN 3.0\n", + "4 64.285067 38.562517 9.0\n", + "5 62.528465 47.103833 6.0\n", + "6 33.938750 21.772930 9.0\n", + "7 50.391814 32.893256 8.0\n", + "8 46.887892 44.628132 3.0\n", + "9 79.867477 75.085125 3.0" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df[['Square', 'LifeSquare', 'KitchenSquare']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "b68c9268", + "metadata": {}, + "outputs": [], + "source": [ + "# если нет данных по жилой площади - добавляем признак и пытаемся расчитать\n", + "train_df['LifeSquare_nan'] = train_df['LifeSquare'].isna() * 1\n", + "\n", + "condition = (train_df['LifeSquare'].isna()) \\\n", + " & (~train_df['Square'].isna()) \\\n", + " & (~train_df['KitchenSquare'].isna())\n", + " \n", + "train_df.loc[condition, 'LifeSquare'] = train_df.loc[condition, 'Square'] \\\n", + " - train_df.loc[condition, 'KitchenSquare'] - 3" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e1c3b780", + "metadata": {}, + "outputs": [], + "source": [ + "#train_df[['Healthcare_1','Healthcare_2']].head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "383b8664", + "metadata": {}, + "outputs": [], + "source": [ + "train_df.drop('Healthcare_1', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "a113fc25", + "metadata": {}, + "outputs": [], + "source": [ + "class DataPreprocessing:\n", + " \"\"\"Подготовка исходных данных\"\"\"\n", + "\n", + " def __init__(self):\n", + " \"\"\"Параметры класса\"\"\"\n", + " self.medians = None\n", + " self.kitchen_square_quantile = None\n", + " \n", + " def fit(self, X):\n", + " \"\"\"Сохранение статистик\"\"\" \n", + " # Расчет медиан\n", + " self.medians = X.median()\n", + " 
self.kitchen_square_quantile = X['KitchenSquare'].quantile(.975)\n", + " \n", + " def transform(self, X):\n", + " \"\"\"Трансформация данных\"\"\"\n", + "\n", + " # Rooms\n", + " X['Rooms_outlier'] = 0\n", + " X.loc[(X['Rooms'] == 0) | (X['Rooms'] >= 6), 'Rooms_outlier'] = 1\n", + " \n", + " X.loc[X['Rooms'] == 0, 'Rooms'] = 1\n", + " X.loc[X['Rooms'] >= 6, 'Rooms'] = self.medians['Rooms']\n", + " \n", + " # KitchenSquare\n", + " condition = (X['KitchenSquare'].isna()) \\\n", + " | (X['KitchenSquare'] > self.kitchen_square_quantile)\n", + " \n", + " X.loc[condition, 'KitchenSquare'] = self.medians['KitchenSquare']\n", + "\n", + " X.loc[X['KitchenSquare'] < 3, 'KitchenSquare'] = 3\n", + " \n", + " # HouseFloor, Floor\n", + " X['HouseFloor_outlier'] = 0\n", + " X.loc[X['HouseFloor'] == 0, 'HouseFloor_outlier'] = 1\n", + " X.loc[X['Floor'] > X['HouseFloor'], 'HouseFloor_outlier'] = 1\n", + " \n", + " X.loc[X['HouseFloor'] == 0, 'HouseFloor'] = self.medians['HouseFloor']\n", + " \n", + " floor_outliers = X.loc[X['Floor'] > X['HouseFloor']].index\n", + " X.loc[floor_outliers, 'Floor'] = X.loc[floor_outliers, 'HouseFloor']\\\n", + " .apply(lambda x: random.randint(1, x))\n", + " \n", + " # HouseYear\n", + " current_year = datetime.now().year\n", + " \n", + " X['HouseYear_outlier'] = 0\n", + " \n", + " X.loc[X['HouseYear'] > current_year, 'HouseYear_outlier'] = 1 \n", + " X.loc[X['HouseYear'] > current_year, 'HouseYear'] = current_year\n", + " \n", + " # AV\n", + " #==========================================================================\n", + " X.loc[X['HouseYear'] < 1910, 'HouseYear_outlier'] = 1 \n", + " X.loc[X['HouseYear'] < 1910, 'HouseYear'] = self.medians['HouseYear']\n", + " #==========================================================================\n", + " X.loc[X['Square'].isna(), 'Square'] = self.medians['Square'] \n", + " #==========================================================================\n", + " \n", + " \n", + " # Healthcare_1\n", + " if 
'Healthcare_1' in X.columns:\n", + " X.drop('Healthcare_1', axis=1, inplace=True)\n", + " \n", + " # LifeSquare\n", + " X['LifeSquare_nan'] = X['LifeSquare'].isna() * 1\n", + " condition = (X['LifeSquare'].isna()) & \\\n", + " (~X['Square'].isna()) & \\\n", + " (~X['KitchenSquare'].isna())\n", + " \n", + " X.loc[condition, 'LifeSquare'] = X.loc[condition, 'Square'] - X.loc[condition, 'KitchenSquare'] - 3\n", + " \n", + " \n", + " X.fillna(self.medians, inplace=True)\n", + " \n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ff95fb13", + "metadata": {}, + "outputs": [], + "source": [ + "# построение новых признаков" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "11e1d608", + "metadata": {}, + "outputs": [], + "source": [ + "# заменяем A/B на 0/1\n", + "binary_to_numbers = {'A': 0, 'B': 1}\n", + "\n", + "train_df['Ecology_2'] = train_df['Ecology_2'].replace(binary_to_numbers)\n", + "train_df['Ecology_3'] = train_df['Ecology_3'].replace(binary_to_numbers)\n", + "train_df['Shops_2'] = train_df['Shops_2'].replace(binary_to_numbers)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "8a00e237", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DistrictIdDistrictSize
027851
11652
223565
36511
49294
\n", + "
" + ], + "text/plain": [ + " DistrictId DistrictSize\n", + "0 27 851\n", + "1 1 652\n", + "2 23 565\n", + "3 6 511\n", + "4 9 294" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# добавляем размер района\n", + "district_size = train_df['DistrictId'].value_counts().reset_index()\\\n", + " .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})\n", + "\n", + "district_size.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "a479ebce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...Social_2Social_3Helthcare_2Shops_1Shops_2PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSize
011809273.0115.027311102.02731110.041020140.075424...30970001305018.87108902651.7082721851
13013221.039.83252423.1692238.07819660.118537...62071101177734.55340704462.045990078
2821513.078.34221547.67197210.021719880.025609...52610311282078.72085003600.5966980652
3235211.040.40990734.4099073.0102219770.007122...2640011168106.00763004160.0196701652
413866942.064.28506738.5625179.0161619720.282798...86672061343995.10296205351.088794089
\n", + "

5 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Social_2 Social_3 Helthcare_2 \\\n", + "0 10 2014 0.075424 ... 3097 0 0 \n", + "1 8 1966 0.118537 ... 6207 1 1 \n", + "2 17 1988 0.025609 ... 5261 0 3 \n", + "3 22 1977 0.007122 ... 264 0 0 \n", + "4 16 1972 0.282798 ... 8667 2 0 \n", + "\n", + " Shops_1 Shops_2 Price HouseFloor_outlier Meter_price \\\n", + "0 0 1 305018.871089 0 2651.708272 \n", + "1 0 1 177734.553407 0 4462.045990 \n", + "2 1 1 282078.720850 0 3600.596698 \n", + "3 1 1 168106.007630 0 4160.019670 \n", + "4 6 1 343995.102962 0 5351.088794 \n", + "\n", + " LifeSquare_nan DistrictSize \n", + "0 1 851 \n", + "1 0 78 \n", + "2 0 652 \n", + "3 1 652 \n", + "4 0 89 \n", + "\n", + "[5 rows x 23 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# добавляем призак в датасет\n", + "train_df = train_df.merge(district_size, on='DistrictId', how='left')\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "aebec7b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True 5069\n", + "False 4931\n", + "Name: DistrictSize, dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# большие районы\n", + "(train_df['DistrictSize'] > 100).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "bdc9ecf1", + "metadata": {}, + "outputs": [], + "source": [ + "train_df['IsDistrictLarge'] = (train_df['DistrictSize'] > 100).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "9bc15614", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DistrictIdRoomsMedPriceByDistrict
001.0142799.434052
102.0176210.812334
203.0300610.359425
311.0146735.671740
412.0202706.090239
\n", + "
" + ], + "text/plain": [ + " DistrictId Rooms MedPriceByDistrict\n", + "0 0 1.0 142799.434052\n", + "1 0 2.0 176210.812334\n", + "2 0 3.0 300610.359425\n", + "3 1 1.0 146735.671740\n", + "4 1 2.0 202706.090239" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# медианная стоимость по району\n", + "med_price_by_district = train_df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'median'})\\\n", + " .rename(columns={'Price':'MedPriceByDistrict'})\n", + "\n", + "med_price_by_district.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "b6129f8e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(675, 3)" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "med_price_by_district.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "0d3d34ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...Helthcare_2Shops_1Shops_2PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSizeIsDistrictLargeMedPriceByDistrict
011809273.0115.027311102.02731110.041020140.075424...001305018.87108902651.70827218511213530.788443
13013221.039.83252423.1692238.07819660.118537...101177734.55340704462.0459900780146032.829834
2821513.078.34221547.67197210.021719880.025609...311282078.72085003600.59669806521244005.272380
3235211.040.40990734.4099073.0102219770.007122...011168106.00763004160.01967016521146735.671740
413866942.064.28506738.5625179.0161619720.282798...061343995.10296205351.0887940890201584.441255
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Helthcare_2 Shops_1 Shops_2 \\\n", + "0 10 2014 0.075424 ... 0 0 1 \n", + "1 8 1966 0.118537 ... 1 0 1 \n", + "2 17 1988 0.025609 ... 3 1 1 \n", + "3 22 1977 0.007122 ... 0 1 1 \n", + "4 16 1972 0.282798 ... 0 6 1 \n", + "\n", + " Price HouseFloor_outlier Meter_price LifeSquare_nan \\\n", + "0 305018.871089 0 2651.708272 1 \n", + "1 177734.553407 0 4462.045990 0 \n", + "2 282078.720850 0 3600.596698 0 \n", + "3 168106.007630 0 4160.019670 1 \n", + "4 343995.102962 0 5351.088794 0 \n", + "\n", + " DistrictSize IsDistrictLarge MedPriceByDistrict \n", + "0 851 1 213530.788443 \n", + "1 78 0 146032.829834 \n", + "2 652 1 244005.272380 \n", + "3 652 1 146735.671740 \n", + "4 89 0 201584.441255 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = train_df.merge(med_price_by_district, on=['DistrictId', 'Rooms'], how='left')\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "16892db7", + "metadata": {}, + "outputs": [], + "source": [ + "# категоризация\n", + "def floor_to_cat(X):\n", + "\n", + " X['floor_cat'] = 0\n", + "\n", + " X.loc[X['Floor'] <= 3, 'floor_cat'] = 1 \n", + " X.loc[(X['Floor'] > 3) & (X['Floor'] <= 5), 'floor_cat'] = 2\n", + " X.loc[(X['Floor'] > 5) & (X['Floor'] <= 9), 'floor_cat'] = 3\n", + " X.loc[(X['Floor'] > 9) & (X['Floor'] <= 15), 'floor_cat'] = 4\n", + " X.loc[X['Floor'] > 15, 'floor_cat'] = 5\n", + "\n", + " return X\n", + "\n", + "\n", + "def floor_to_cat_pandas(X):\n", + " bins = [0, 3, 5, 9, 15, 
X['Floor'].max()]\n", + " X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False)\n", + " \n", + " X['floor_cat'].fillna(-1, inplace=True)\n", + " return X\n", + "\n", + "\n", + "def year_to_cat(X):\n", + "\n", + " X['year_cat'] = 0\n", + "\n", + " X.loc[X['HouseYear'] <= 1941, 'year_cat'] = 1\n", + " X.loc[(X['HouseYear'] > 1941) & (X['HouseYear'] <= 1945), 'year_cat'] = 2\n", + " X.loc[(X['HouseYear'] > 1945) & (X['HouseYear'] <= 1980), 'year_cat'] = 3\n", + " X.loc[(X['HouseYear'] > 1980) & (X['HouseYear'] <= 2000), 'year_cat'] = 4\n", + " X.loc[(X['HouseYear'] > 2000) & (X['HouseYear'] <= 2010), 'year_cat'] = 5\n", + " X.loc[(X['HouseYear'] > 2010), 'year_cat'] = 6\n", + "\n", + " return X\n", + "\n", + "\n", + "def year_to_cat_pandas(X):\n", + " bins = [0, 1941, 1945, 1980, 2000, 2010, X['HouseYear'].max()]\n", + " X['year_cat'] = pd.cut(X['HouseYear'], bins=bins, labels=False)\n", + " \n", + " X['year_cat'].fillna(-1, inplace=True)\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "95906706", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 2\n", + "2 0\n", + "3 3\n", + "4 4\n", + " ..\n", + "9995 0\n", + "9996 3\n", + "9997 0\n", + "9998 0\n", + "9999 2\n", + "Name: Floor, Length: 10000, dtype: int64" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bins = [0, 3, 5, 9, 15, train_df['Floor'].max()]\n", + "pd.cut(train_df['Floor'], bins=bins, labels=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "e47e6ba7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 (3, 5]\n", + "1 (5, 9]\n", + "2 (0, 3]\n", + "3 (9, 15]\n", + "4 (15, 42]\n", + " ... 
\n", + "9995 (0, 3]\n", + "9996 (9, 15]\n", + "9997 (0, 3]\n", + "9998 (0, 3]\n", + "9999 (5, 9]\n", + "Name: Floor, Length: 10000, dtype: category\n", + "Categories (5, interval[int64, right]): [(0, 3] < (3, 5] < (5, 9] < (9, 15] < (15, 42]]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bins = [0, 3, 5, 9, 15, train_df['Floor'].max()]\n", + "pd.cut(train_df['Floor'], bins=bins)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "20ca77d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...Shops_2PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSizeIsDistrictLargeMedPriceByDistrictyear_catfloor_cat
011809273.0115.027311102.02731110.041020140.075424...1305018.87108902651.70827218511213530.78844362
13013221.039.83252423.1692238.07819660.118537...1177734.55340704462.0459900780146032.82983433
2821513.078.34221547.67197210.021719880.025609...1282078.72085003600.59669806521244005.27238041
3235211.040.40990734.4099073.0102219770.007122...1168106.00763004160.01967016521146735.67174034
413866942.064.28506738.5625179.0161619720.282798...1343995.10296205351.0887940890201584.44125535
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Shops_2 Price \\\n", + "0 10 2014 0.075424 ... 1 305018.871089 \n", + "1 8 1966 0.118537 ... 1 177734.553407 \n", + "2 17 1988 0.025609 ... 1 282078.720850 \n", + "3 22 1977 0.007122 ... 1 168106.007630 \n", + "4 16 1972 0.282798 ... 1 343995.102962 \n", + "\n", + " HouseFloor_outlier Meter_price LifeSquare_nan DistrictSize \\\n", + "0 0 2651.708272 1 851 \n", + "1 0 4462.045990 0 78 \n", + "2 0 3600.596698 0 652 \n", + "3 0 4160.019670 1 652 \n", + "4 0 5351.088794 0 89 \n", + "\n", + " IsDistrictLarge MedPriceByDistrict year_cat floor_cat \n", + "0 1 213530.788443 6 2 \n", + "1 0 146032.829834 3 3 \n", + "2 1 244005.272380 4 1 \n", + "3 1 146735.671740 3 4 \n", + "4 0 201584.441255 3 5 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = year_to_cat(train_df)\n", + "train_df = floor_to_cat(train_df)\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "a3fb003c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
year_catfloor_catMedPriceByFloorYear
011374737.658654
112322905.044793
213429825.896082
314274992.472366
421467230.539057
\n", + "
" + ], + "text/plain": [ + " year_cat floor_cat MedPriceByFloorYear\n", + "0 1 1 374737.658654\n", + "1 1 2 322905.044793\n", + "2 1 3 429825.896082\n", + "3 1 4 274992.472366\n", + "4 2 1 467230.539057" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#!!! медианная цена по году и этажу\n", + "med_price_by_floor_year = train_df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'}).\\\n", + " rename(columns={'Price':'MedPriceByFloorYear'})\n", + "med_price_by_floor_year.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "16adca39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdDistrictIdRoomsSquareLifeSquareKitchenSquareFloorHouseFloorHouseYearEcology_1...PriceHouseFloor_outlierMeter_priceLifeSquare_nanDistrictSizeIsDistrictLargeMedPriceByDistrictyear_catfloor_catMedPriceByFloorYear
011809273.0115.027311102.02731110.041020140.075424...305018.87108902651.70827218511213530.78844362161974.797753
13013221.039.83252423.1692238.07819660.118537...177734.55340704462.0459900780146032.82983433192538.872015
2821513.078.34221547.67197210.021719880.025609...282078.72085003600.59669806521244005.27238041204670.096676
3235211.040.40990734.4099073.0102219770.007122...168106.00763004160.01967016521146735.67174034183505.979560
413866942.064.28506738.5625179.0161619720.282798...343995.10296205351.0887940890201584.44125535173955.556579
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " Id DistrictId Rooms Square LifeSquare KitchenSquare Floor \\\n", + "0 11809 27 3.0 115.027311 102.027311 10.0 4 \n", + "1 3013 22 1.0 39.832524 23.169223 8.0 7 \n", + "2 8215 1 3.0 78.342215 47.671972 10.0 2 \n", + "3 2352 1 1.0 40.409907 34.409907 3.0 10 \n", + "4 13866 94 2.0 64.285067 38.562517 9.0 16 \n", + "\n", + " HouseFloor HouseYear Ecology_1 ... Price HouseFloor_outlier \\\n", + "0 10 2014 0.075424 ... 305018.871089 0 \n", + "1 8 1966 0.118537 ... 177734.553407 0 \n", + "2 17 1988 0.025609 ... 282078.720850 0 \n", + "3 22 1977 0.007122 ... 168106.007630 0 \n", + "4 16 1972 0.282798 ... 343995.102962 0 \n", + "\n", + " Meter_price LifeSquare_nan DistrictSize IsDistrictLarge \\\n", + "0 2651.708272 1 851 1 \n", + "1 4462.045990 0 78 0 \n", + "2 3600.596698 0 652 1 \n", + "3 4160.019670 1 652 1 \n", + "4 5351.088794 0 89 0 \n", + "\n", + " MedPriceByDistrict year_cat floor_cat MedPriceByFloorYear \n", + "0 213530.788443 6 2 161974.797753 \n", + "1 146032.829834 3 3 192538.872015 \n", + "2 244005.272380 4 1 204670.096676 \n", + "3 146735.671740 3 4 183505.979560 \n", + "4 201584.441255 3 5 173955.556579 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = train_df.merge(med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')\n", + "train_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "1fa55c19", + "metadata": {}, + "outputs": [], + "source": [ + "class FeatureGenetator():\n", + " \"\"\"Генерация новых фич\"\"\"\n", + " \n", + " def __init__(self):\n", + " self.DistrictId_counts = None\n", + " self.binary_to_numbers = None\n", + " self.med_price_by_district = None\n", + " self.med_price_by_floor_year = None\n", + " self.house_year_max = None\n", + " self.floor_max = None\n", + " self.district_size = None\n", + " self.meter_price = None #AV\n", + " self.meter_price_by_district_rooms 
= None #AV\n", + " \n", + " def fit(self, X, y=None):\n", + " \n", + " X = X.copy()\n", + " \n", + " # Binary features\n", + " self.binary_to_numbers = {'A': 0, 'B': 1}\n", + " \n", + " # DistrictID\n", + " self.district_size = X['DistrictId'].value_counts().reset_index() \\\n", + " .rename(columns={'index':'DistrictId', 'DistrictId':'DistrictSize'})\n", + " \n", + " # Target encoding\n", + " ## District, Rooms\n", + " df = X.copy()\n", + " \n", + " if y is not None:\n", + " df['Price'] = y.values\n", + " \n", + " self.med_price_by_district = df.groupby(['DistrictId', 'Rooms'], as_index=False).agg({'Price':'median'})\\\n", + " .rename(columns={'Price':'MedPriceByDistrict'})\n", + " \n", + " self.med_price_by_district_median = self.med_price_by_district['MedPriceByDistrict'].median()\n", + " \n", + " ## floor, year\n", + " if y is not None:\n", + " self.floor_max = df['Floor'].max()\n", + " self.house_year_max = df['HouseYear'].max()\n", + " df['Price'] = y.values\n", + " df = self.floor_to_cat(df)\n", + " df = self.year_to_cat(df)\n", + " self.med_price_by_floor_year = df.groupby(['year_cat', 'floor_cat'], as_index=False).agg({'Price':'median'}).\\\n", + " rename(columns={'Price':'MedPriceByFloorYear'})\n", + " self.med_price_by_floor_year_median = self.med_price_by_floor_year['MedPriceByFloorYear'].median()\n", + " \n", + " if y is not None:\n", + " df['Price'] = y.values\n", + " self.meter_price = df['Price']/df['Square'] #AV \n", + " \n", + " \n", + " if y is not None:\n", + " df['Price'] = y.values \n", + " df = self.fill_meter_price(df)\n", + " self.meter_price_by_district_rooms = df.groupby(['DistrictId', 'Rooms'], as_index=False).\\\n", + " agg({'MeterPrice':'median'}).\\\n", + " rename(columns={'MeterPrice':'MeterPriceByDistrictRooms'})\n", + " \n", + " \n", + " \n", + " def transform(self, X):\n", + " \n", + " # Binary features\n", + " X['Ecology_2'] = X['Ecology_2'].map(self.binary_to_numbers) # self.binary_to_numbers = {'A': 0, 'B': 1}\n", + " 
X['Ecology_3'] = X['Ecology_3'].map(self.binary_to_numbers)\n", + " X['Shops_2'] = X['Shops_2'].map(self.binary_to_numbers)\n", + " \n", + " # DistrictId, IsDistrictLarge\n", + " X = X.merge(self.district_size, on='DistrictId', how='left')\n", + " \n", + " X['new_district'] = 0\n", + " X.loc[X['DistrictSize'].isna(), 'new_district'] = 1\n", + " \n", + " X['DistrictSize'].fillna(5, inplace=True)\n", + " \n", + " X['IsDistrictLarge'] = (X['DistrictSize'] > 100).astype(int)\n", + " \n", + " # More categorical features\n", + " X = self.floor_to_cat(X) # + столбец floor_cat\n", + " X = self.year_to_cat(X) # + столбец year_cat\n", + " \n", + " # Target encoding\n", + " if self.med_price_by_district is not None:\n", + " X = X.merge(self.med_price_by_district, on=['DistrictId', 'Rooms'], how='left')\n", + " X['MedPriceByDistrict'].fillna(self.med_price_by_district_median, inplace=True)\n", + " \n", + " if self.med_price_by_floor_year is not None:\n", + " X = X.merge(self.med_price_by_floor_year, on=['year_cat', 'floor_cat'], how='left')\n", + " X['MedPriceByFloorYear'].fillna(self.med_price_by_floor_year_median, inplace=True)\n", + " \n", + " # AV\n", + " X['MeterPrice'] = self.meter_price\n", + " X['MeterPrice'].fillna(3910,inplace = True)\n", + " #X.loc[(X['Square'] > 0), 'MeterPrice'] = X['Price'] / X['Square'] \n", + " \n", + " if self.meter_price_by_district_rooms is not None:\n", + " X = X.merge(self.meter_price_by_district_rooms, on=['DistrictId', 'Rooms'], how='left')\n", + " X['MeterPriceByDistrictRooms'].fillna(self.med_price_by_floor_year_median, inplace=True) \n", + " \n", + " \n", + " return X \n", + " \n", + " def floor_to_cat(self, X):\n", + " bins = [0, 3, 5, 9, 15, self.floor_max]\n", + " X['floor_cat'] = pd.cut(X['Floor'], bins=bins, labels=False)\n", + "\n", + " X['floor_cat'].fillna(-1, inplace=True)\n", + " return X\n", + " \n", + " def year_to_cat(self, X):\n", + " bins = [0, 1941, 1945, 1980, 2000, 2010, self.house_year_max]\n", + " X['year_cat'] = 
pd.cut(X['HouseYear'], bins=bins, labels=False)\n", + " X['year_cat'].fillna(-1, inplace=True)\n", + " return X\n", + " \n", + " def fill_meter_price(self, X): #AV\n", + " X['MeterPrice'] = self.meter_price\n", + " X['MeterPrice'].fillna(3910,inplace = True)\n", + " X.loc[(X['Square'] > 0), 'MeterPrice'] = X['Price'] / X['Square']\n", + " return X\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "6e21b269", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Id',\n", + " 'DistrictId',\n", + " 'Rooms',\n", + " 'Square',\n", + " 'LifeSquare',\n", + " 'KitchenSquare',\n", + " 'Floor',\n", + " 'HouseFloor',\n", + " 'HouseYear',\n", + " 'Ecology_1',\n", + " 'Ecology_2',\n", + " 'Ecology_3',\n", + " 'Social_1',\n", + " 'Social_2',\n", + " 'Social_3',\n", + " 'Helthcare_2',\n", + " 'Shops_1',\n", + " 'Shops_2',\n", + " 'Price',\n", + " 'HouseFloor_outlier',\n", + " 'Meter_price',\n", + " 'LifeSquare_nan',\n", + " 'DistrictSize',\n", + " 'IsDistrictLarge',\n", + " 'MedPriceByDistrict',\n", + " 'year_cat',\n", + " 'floor_cat',\n", + " 'MedPriceByFloorYear']" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# отбор призаков\n", + "train_df.columns.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b70cf62f", + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',\n", + " 'Ecology_1', 'Ecology_2', 'Ecology_3', 'Social_1', 'Social_2', 'Social_3',\n", + " 'Helthcare_2', 'Shops_1', 'Shops_2']\n", + "\n", + "new_feature_names = ['Rooms_outlier', 'HouseFloor_outlier', 'HouseYear_outlier', 'LifeSquare_nan', 'DistrictSize',\n", + " 'new_district', 'IsDistrictLarge', 'MedPriceByDistrict', 'MedPriceByFloorYear', 'MeterPrice',\n", + " 'MeterPriceByDistrictRooms']\n", + "\n", + "target_name = 'Price'" + ] + }, + { + "cell_type": "code", + 
"execution_count": 63, + "id": "6cdc1577", + "metadata": {}, + "outputs": [], + "source": [ + "# разбиение на train и test" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "31fa43e6", + "metadata": {}, + "outputs": [], + "source": [ + "train_df = pd.read_csv(TRAIN_DATASET_PATH)\n", + "test_df = pd.read_csv(TEST_DATASET_PATH)\n", + "\n", + "X = train_df.drop(columns=target_name)\n", + "y = train_df[target_name]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "0fac92ea", + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=21)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "3209cde6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((6700, 22), (3300, 22), (5000, 22))" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preprocessor = DataPreprocessing()\n", + "preprocessor.fit(X_train)\n", + "\n", + "X_train = preprocessor.transform(X_train)\n", + "X_valid = preprocessor.transform(X_valid)\n", + "test_df = preprocessor.transform(test_df)\n", + "\n", + "X_train.shape, X_valid.shape, test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "f9474705", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((6700, 31), (3300, 31), (5000, 31))" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features_gen = FeatureGenetator()\n", + "features_gen.fit(X_train, y_train)\n", + "\n", + "X_train = features_gen.transform(X_train)\n", + "X_valid = features_gen.transform(X_valid)\n", + "test_df = features_gen.transform(test_df)\n", + "\n", + "X_train.shape, X_valid.shape, test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "6fba65c6", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = 
X_train[feature_names + new_feature_names]\n", + "X_valid = X_valid[feature_names + new_feature_names]\n", + "test_df = test_df[feature_names + new_feature_names]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "a67cd879", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 0, 0)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.isna().sum().sum(), X_valid.isna().sum().sum(), test_df.isna().sum().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "4c9c6e00", + "metadata": {}, + "outputs": [], + "source": [ + "# построение модели" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "48adceb1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestRegressor(random_state=21)" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rf_model = RandomForestRegressor(random_state=21, criterion='mse')\n", + "rf_model.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "984202de", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train R2:\t0.967\n", + "Test R2:\t0.722\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + " # оценка модели\n", + "y_train_preds = rf_model.predict(X_train)\n", + "y_test_preds = rf_model.predict(X_valid)\n", + "\n", + "evaluate_preds(y_train, y_train_preds, y_valid, y_test_preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "f880df39", + "metadata": {}, + "outputs": [], + "source": [ + "# кросс валидация" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "0290cf24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0.78324412, 0.73308622, 0.76224678])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_score = cross_val_score(rf_model, X_train, y_train, scoring='r2', cv=KFold(n_splits=3, shuffle=True, random_state=21))\n", + "cv_score" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "5d373cf1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7595257056684376" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cv_score.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "b5be4638", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
feature_nameimportance
23MedPriceByDistrict6.377276e-01
1Square1.169206e-01
26MeterPriceByDistrictRooms5.302766e-02
2LifeSquare2.742734e-02
6HouseYear1.880421e-02
24MedPriceByFloorYear1.836782e-02
5HouseFloor1.637686e-02
25MeterPrice1.623973e-02
4Floor1.337890e-02
3KitchenSquare1.172162e-02
12Social_31.140762e-02
7Ecology_11.114983e-02
11Social_21.038791e-02
20DistrictSize1.014266e-02
10Social_19.488800e-03
14Shops_17.056769e-03
13Helthcare_24.669791e-03
17HouseFloor_outlier1.497970e-03
0Rooms1.197059e-03
19LifeSquare_nan8.821891e-04
15Shops_28.546742e-04
9Ecology_36.578909e-04
22IsDistrictLarge4.151446e-04
8Ecology_21.322893e-04
16Rooms_outlier6.708606e-05
18HouseYear_outlier1.218268e-08
21new_district0.000000e+00
\n", + "
" + ], + "text/plain": [ + " feature_name importance\n", + "23 MedPriceByDistrict 6.377276e-01\n", + "1 Square 1.169206e-01\n", + "26 MeterPriceByDistrictRooms 5.302766e-02\n", + "2 LifeSquare 2.742734e-02\n", + "6 HouseYear 1.880421e-02\n", + "24 MedPriceByFloorYear 1.836782e-02\n", + "5 HouseFloor 1.637686e-02\n", + "25 MeterPrice 1.623973e-02\n", + "4 Floor 1.337890e-02\n", + "3 KitchenSquare 1.172162e-02\n", + "12 Social_3 1.140762e-02\n", + "7 Ecology_1 1.114983e-02\n", + "11 Social_2 1.038791e-02\n", + "20 DistrictSize 1.014266e-02\n", + "10 Social_1 9.488800e-03\n", + "14 Shops_1 7.056769e-03\n", + "13 Helthcare_2 4.669791e-03\n", + "17 HouseFloor_outlier 1.497970e-03\n", + "0 Rooms 1.197059e-03\n", + "19 LifeSquare_nan 8.821891e-04\n", + "15 Shops_2 8.546742e-04\n", + "9 Ecology_3 6.578909e-04\n", + "22 IsDistrictLarge 4.151446e-04\n", + "8 Ecology_2 1.322893e-04\n", + "16 Rooms_outlier 6.708606e-05\n", + "18 HouseYear_outlier 1.218268e-08\n", + "21 new_district 0.000000e+00" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# веса признаков\n", + "feature_importances = pd.DataFrame(zip(X_train.columns, rf_model.feature_importances_), \n", + " columns=['feature_name', 'importance'])\n", + "\n", + "feature_importances.sort_values(by='importance', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "7d95cb34", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(5000, 27)" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# выгрузка результатов\n", + "test_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "90e14b5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPrice
04567200000.0
15925200000.0
2960200000.0
33848200000.0
4746200000.0
\n", + "
" + ], + "text/plain": [ + " Id Price\n", + "0 4567 200000.0\n", + "1 5925 200000.0\n", + "2 960 200000.0\n", + "3 3848 200000.0\n", + "4 746 200000.0" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "submit = pd.read_csv('C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/sample_submission.csv')\n", + "submit.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "2c01f0a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([164129.41193348, 117169.5995882 , 139193.78182605, ...,\n", + " 148103.70058817, 189761.30723796, 229074.87215388])" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = rf_model.predict(test_df)\n", + "predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "6c9f1523", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPrice
04567164129.411933
15925117169.599588
2960139193.781826
33848169578.024348
4746171922.946686
\n", + "
" + ], + "text/plain": [ + " Id Price\n", + "0 4567 164129.411933\n", + "1 5925 117169.599588\n", + "2 960 139193.781826\n", + "3 3848 169578.024348\n", + "4 746 171922.946686" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "submit['Price'] = predictions\n", + "submit.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "e6052d38", + "metadata": {}, + "outputs": [], + "source": [ + "submit.to_csv('C:/ARTEM/GeekBrains/Python4DS/RealEstatePricePredictionMoscow/rf_submit.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}