{ "cells": [ { "cell_type": "code", "execution_count": 73, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:15.759516Z", "start_time": "2018-12-16T11:12:14.836378Z" } }, "outputs": [], "source": [ "import requests, pandas as pd, numpy as np\n", "from requests import session\n", "from bs4 import BeautifulSoup" ] }, { "cell_type": "code", "execution_count": 229, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:15.834578Z", "start_time": "2018-12-16T11:12:15.763518Z" } }, "outputs": [], "source": [ "dfsi=pd.read_csv('dfsi.csv',sep=';')" ] }, { "cell_type": "code", "execution_count": 230, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:15.899576Z", "start_time": "2018-12-16T11:12:15.838583Z" } }, "outputs": [], "source": [ "dfsi2=pd.read_csv('dfsi2.csv',sep=';')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normalize" ] }, { "cell_type": "code", "execution_count": 231, "metadata": {}, "outputs": [], "source": [ "# Headers are assigned by position; both tables get the same four leading and\n", "# five trailing names, and differ only in the four yearly value columns.\n", "dfsi.columns=(['Unnamed: 0', '0', 'tavaly', 'Cégnév']\n", "              +['Alkalmazottak száma 2018', 'Alkalmazottak száma 2017',\n", "                'Alkalmazottak száma 2016', 'Alkalmazottak száma 2015']\n", "              +['region', 'nr', 'coords', 'kws', 'cms'])\n", "dfsi2.columns=(['Unnamed: 0', '0', 'tavaly', 'Cégnév']\n", "               +['Árbevétel 2018 (RON)', 'Árbevétel 2017 (RON)',\n", "                 'Árbevétel 2016 (RON)', 'Árbevétel 2015 (RON)']\n", "               +['region', 'nr', 'coords', 'kws', 'cms'])" ] }, { "cell_type": "code", "execution_count": 232, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:15.911583Z", "start_time": "2018-12-16T11:12:15.904585Z" } }, "outputs": [], "source": [ "# Keep each table's 'nr' under its own name as well: the join suffixes the\n", "# shared 'nr' columns and those suffixed copies are dropped afterwards.\n", "dfsi=dfsi.assign(nr_alkalmazottak=dfsi['nr'])\n", "dfsi2=dfsi2.assign(nr_arbevetel=dfsi2['nr'])" ] }, { "cell_type": "code", "execution_count": 233, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 00tavalyCégnévAlkalmazottak száma 2018Alkalmazottak száma 2017Alkalmazottak száma 2016Alkalmazottak száma 2015regionnrcoordskwscmsnr_alkalmazottak
0012READY GARMENT TECHNOLOGY ROMANIA SRL9907301 0051 199also-haromszek1['45.8772830', ' 25.7994510']['mosas-tisztitas', '', '\"']520036, Sepsiszentgy\\xf6rgy, Cs\\xedki u., 149/A\\t1
1121VALKES SRL6217888861 007also-haromszek2['45.855129 ', ' 25.806651 ']['kabelek', '', '\"']520077, Sepsiszentgy\\xf6rgy, Păiş Da...2
\n", "
" ], "text/plain": [ " Unnamed: 0 0 tavaly Cégnév \\\n", "0 0 1 2 READY GARMENT TECHNOLOGY ROMANIA SRL \n", "1 1 2 1 VALKES SRL \n", "\n", " Alkalmazottak száma 2018 Alkalmazottak száma 2017 Alkalmazottak száma 2016 \\\n", "0 990 730 1 005 \n", "1 621 788 886 \n", "\n", " Alkalmazottak száma 2015 region nr coords \\\n", "0 1 199 also-haromszek 1 ['45.8772830', ' 25.7994510'] \n", "1 1 007 also-haromszek 2 ['45.855129 ', ' 25.806651 '] \n", "\n", " kws \\\n", "0 ['mosas-tisztitas', '', '\"'] \n", "1 ['kabelek', '', '\"'] \n", "\n", " cms nr_alkalmazottak \n", "0 520036, Sepsiszentgy\\xf6rgy, Cs\\xedki u., 149/A\\t 1 \n", "1 520077, Sepsiszentgy\\xf6rgy, Păiş Da... 2 " ] }, "execution_count": 233, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfsi.head(2)" ] }, { "cell_type": "code", "execution_count": 234, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 00tavalyCégnévÁrbevétel 2018 (RON)Árbevétel 2017 (RON)Árbevétel 2016 (RON)Árbevétel 2015 (RON)regionnrcoordskwscmsnr_arbevetel
0011FABRICA DE LAPTE BRASOV SA468 016 207431 950 759370 418 528338 283 110also-haromszek1['46.0750630', ' 25.6087250']['tejfeldolgozas', '', '\"']525100, Bar\\xf3t, V\\xedz u., 109\\t1
1122COVALACT SA252 640 519235 415 329230 073 646199 493 229also-haromszek2['45.869573 ', ' 25.800705 ']['tejfeldolgozas', '', '\"']520036, Sepsiszentgy\\xf6rgy, Oltmez\\xf5 u., 1\\t2
\n", "
" ], "text/plain": [ " Unnamed: 0 0 tavaly Cégnév Árbevétel 2018 (RON) \\\n", "0 0 1 1 FABRICA DE LAPTE BRASOV SA 468 016 207 \n", "1 1 2 2 COVALACT SA 252 640 519 \n", "\n", " Árbevétel 2017 (RON) Árbevétel 2016 (RON) Árbevétel 2015 (RON) \\\n", "0 431 950 759 370 418 528 338 283 110 \n", "1 235 415 329 230 073 646 199 493 229 \n", "\n", " region nr coords \\\n", "0 also-haromszek 1 ['46.0750630', ' 25.6087250'] \n", "1 also-haromszek 2 ['45.869573 ', ' 25.800705 '] \n", "\n", " kws \\\n", "0 ['tejfeldolgozas', '', '\"'] \n", "1 ['tejfeldolgozas', '', '\"'] \n", "\n", " cms nr_arbevetel \n", "0 525100, Bar\\xf3t, V\\xedz u., 109\\t 1 \n", "1 520036, Sepsiszentgy\\xf6rgy, Oltmez\\xf5 u., 1\\t 2 " ] }, "execution_count": 234, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfsi2.head(2)" ] }, { "cell_type": "code", "execution_count": 235, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:15.941582Z", "start_time": "2018-12-16T11:12:15.915585Z" } }, "outputs": [], "source": [ "# Outer join on the company name ('Cégnév'): a company present in only one\n", "# table is kept, with NaN in the other table's columns. Columns that exist in\n", "# both tables get the '_left' (dfsi) / '_right' (dfsi2) suffix.\n", "data=dfsi.set_index('Cégnév').join(dfsi2.set_index('Cégnév'),how='outer',lsuffix='_left', rsuffix='_right')" ] }, { "cell_type": "code", "execution_count": 236, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:16.604103Z", "start_time": "2018-12-16T11:12:15.945583Z" } }, "outputs": [], "source": [ "# Coalesce the duplicated descriptive columns: take the '_right' value and\n", "# fall back to the '_left' value where the '_right' one is missing.\n", "for merged_name, stem in [('Cím','cms'),('Koord','coords'),('Kw','kws'),('Régió','region')]:\n", "    right_values=data[stem+'_right']\n", "    left_values=data[stem+'_left']\n", "    data[merged_name]=right_values.where(right_values.notna(),left_values)" ] }, { "cell_type": "code", "execution_count": 237, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:16.620101Z", "start_time": "2018-12-16T11:12:16.609111Z" } }, "outputs": [], "source": [ "# Remove the suffixed source columns now that their merged versions exist, and\n", "# turn the 'Cégnév' index back into an ordinary column.\n", "data=data.drop(['0_left', 'region_left','nr_left', 'coords_left', 'kws_left', 'cms_left',\n", "                '0_right', 'region_right', 'nr_right', 'coords_right', 'kws_right', 'cms_right',\n", "                'Unnamed: 0_left','Unnamed: 0_right'],axis=1).reset_index()" ] }, { "cell_type": "code", "execution_count": 238, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Cégnévtavaly_leftAlkalmazottak száma 2018Alkalmazottak száma 2017Alkalmazottak száma 2016Alkalmazottak száma 2015nr_alkalmazottaktavaly_rightÁrbevétel 2018 (RON)Árbevétel 2017 (RON)Árbevétel 2016 (RON)Árbevétel 2015 (RON)nr_arbevetelCímKoordKwRégió
0A M C SRLNaNNaNNaNNaNNaNNaN2028 716 06127 881 27024 390 00522 173 03922.0527100, H\\xeddv\\xe9g, Rom\\xe1n u., 203\\t['45.8391350', ' 25.5892160']['nagykereskedelem', '', '\"']also-haromszek
1ABC IMPEX SRL2013914513213221.02037 875 37236 539 53033 071 73331 956 49824.0535600, Sz\\xe9kelyudvarhely, R\\xe1k\\xf3czi Fer...['46.289768 ', ' 25.290034 ']['csomagoloanyagok', 'nyomdak', '', '\"']udvarhelyszek
2ABRAZIV SRLNaNNaNNaNNaNNaNNaN4916 316 7055 483 2750017.0535500, Gyergy\\xf3szentmikl\\xf3s, \\xc1llom\\xe1...['46.7174250', ' 25.5751650']['szerszamgepek', '', '\"']gyergyoszek
3ADILEX FUNGO SRLNaNNaNNaNNaNNaNNaN>506 834 996925 866--50.0537355, Vasl\\xe1b, , 37\\t['46.64607', ' 25.62371']['zoldseg-gyumolcs-csomagolas', '', '\"']gyergyoszek
4ADIMAG COM IMPEX SRLNaNNaNNaNNaNNaNNaN3665 548 66357 106 22448 827 47147 437 38940.0540190, Marosv\\xe1s\\xe1rhely, Szabads\\xe1g u.,...['46.537905', ' 24.548819']['belsoepiteszeti-anyagok', 'epitoanyagok', 'f...marosszek
\n", "
" ], "text/plain": [ " Cégnév tavaly_left Alkalmazottak száma 2018 \\\n", "0 A M C SRL NaN NaN \n", "1 ABC IMPEX SRL 20 139 \n", "2 ABRAZIV SRL NaN NaN \n", "3 ADILEX FUNGO SRL NaN NaN \n", "4 ADIMAG COM IMPEX SRL NaN NaN \n", "\n", " Alkalmazottak száma 2017 Alkalmazottak száma 2016 Alkalmazottak száma 2015 \\\n", "0 NaN NaN NaN \n", "1 145 132 132 \n", "2 NaN NaN NaN \n", "3 NaN NaN NaN \n", "4 NaN NaN NaN \n", "\n", " nr_alkalmazottak tavaly_right Árbevétel 2018 (RON) Árbevétel 2017 (RON) \\\n", "0 NaN 20 28 716 061 27 881 270 \n", "1 21.0 20 37 875 372 36 539 530 \n", "2 NaN 49 16 316 705 5 483 275 \n", "3 NaN >50 6 834 996 925 866 \n", "4 NaN 36 65 548 663 57 106 224 \n", "\n", " Árbevétel 2016 (RON) Árbevétel 2015 (RON) nr_arbevetel \\\n", "0 24 390 005 22 173 039 22.0 \n", "1 33 071 733 31 956 498 24.0 \n", "2 0 0 17.0 \n", "3 - - 50.0 \n", "4 48 827 471 47 437 389 40.0 \n", "\n", " Cím \\\n", "0 527100, H\\xeddv\\xe9g, Rom\\xe1n u., 203\\t \n", "1 535600, Sz\\xe9kelyudvarhely, R\\xe1k\\xf3czi Fer... \n", "2 535500, Gyergy\\xf3szentmikl\\xf3s, \\xc1llom\\xe1... \n", "3 537355, Vasl\\xe1b, , 37\\t \n", "4 540190, Marosv\\xe1s\\xe1rhely, Szabads\\xe1g u.,... \n", "\n", " Koord \\\n", "0 ['45.8391350', ' 25.5892160'] \n", "1 ['46.289768 ', ' 25.290034 '] \n", "2 ['46.7174250', ' 25.5751650'] \n", "3 ['46.64607', ' 25.62371'] \n", "4 ['46.537905', ' 24.548819'] \n", "\n", " Kw Régió \n", "0 ['nagykereskedelem', '', '\"'] also-haromszek \n", "1 ['csomagoloanyagok', 'nyomdak', '', '\"'] udvarhelyszek \n", "2 ['szerszamgepek', '', '\"'] gyergyoszek \n", "3 ['zoldseg-gyumolcs-csomagolas', '', '\"'] gyergyoszek \n", "4 ['belsoepiteszeti-anyagok', 'epitoanyagok', 'f... 
marosszek " ] }, "execution_count": 238, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 239, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:16.792101Z", "start_time": "2018-12-16T11:12:16.626106Z" } }, "outputs": [], "source": [ "# 'Koord' is the text of a two-item list such as ['45.8391350', ' 25.5892160'].\n", "# Item 0 (about 46) is the latitude and item 1 (about 25) the longitude; they\n", "# used to be stored the other way round, which mislabelled both columns.\n", "# NOTE(review): confirm that the hand-edited manual_manual.xlsx uses the same\n", "# Latitude/Longitude convention before concatenating it.\n", "lat=[]\n", "lon=[]\n", "for koord in data['Koord']:\n", "    parts=(koord.replace(\"'\",'').replace('[','').replace(']','')\n", "           .replace(' ','').split(','))\n", "    lat.append(parts[0])\n", "    lon.append(parts[1])" ] }, { "cell_type": "code", "execution_count": 240, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:16.803108Z", "start_time": "2018-12-16T11:12:16.796102Z" } }, "outputs": [], "source": [ "data['Latitude']=lat\n", "data['Longitude']=lon" ] }, { "cell_type": "code", "execution_count": 241, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:16.838104Z", "start_time": "2018-12-16T11:12:16.807102Z" } }, "outputs": [], "source": [ "data=data.drop('Koord',axis=1)" ] }, { "cell_type": "code", "execution_count": 242, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:12:16.999105Z", "start_time": "2018-12-16T11:12:16.851102Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Cégnévtavaly_leftAlkalmazottak száma 2018Alkalmazottak száma 2017Alkalmazottak száma 2016Alkalmazottak száma 2015nr_alkalmazottaktavaly_rightÁrbevétel 2018 (RON)Árbevétel 2017 (RON)Árbevétel 2016 (RON)Árbevétel 2015 (RON)nr_arbevetelCímKwRégióLatitudeLongitude
0A M C SRLNaNNaNNaNNaNNaNNaN2028 716 06127 881 27024 390 00522 173 03922.0527100, H\\xeddv\\xe9g, Rom\\xe1n u., 203\\t['nagykereskedelem', '', '\"']also-haromszek25.589216045.8391350
1ABC IMPEX SRL2013914513213221.02037 875 37236 539 53033 071 73331 956 49824.0535600, Sz\\xe9kelyudvarhely, R\\xe1k\\xf3czi Fer...['csomagoloanyagok', 'nyomdak', '', '\"']udvarhelyszek25.29003446.289768
2ABRAZIV SRLNaNNaNNaNNaNNaNNaN4916 316 7055 483 2750017.0535500, Gyergy\\xf3szentmikl\\xf3s, \\xc1llom\\xe1...['szerszamgepek', '', '\"']gyergyoszek25.575165046.7174250
3ADILEX FUNGO SRLNaNNaNNaNNaNNaNNaN>506 834 996925 866--50.0537355, Vasl\\xe1b, , 37\\t['zoldseg-gyumolcs-csomagolas', '', '\"']gyergyoszek25.6237146.64607
4ADIMAG COM IMPEX SRLNaNNaNNaNNaNNaNNaN3665 548 66357 106 22448 827 47147 437 38940.0540190, Marosv\\xe1s\\xe1rhely, Szabads\\xe1g u.,...['belsoepiteszeti-anyagok', 'epitoanyagok', 'f...marosszek24.54881946.537905
.........................................................
419WONDERLAND SRL442829282946.0NaNNaNNaNNaNNaNNaN527160, Torja, F\\xf5 u., 225\\t['epitkezes', '', '\"']felso-haromszek26.06374646.040239
420ZABOLA ESTATE SRL394531252328.0NaNNaNNaNNaNNaNNaN527190, Zabola, , 437\\t['hotelek', 'kastelyszallo', 'vendeglatas', ''...felso-haromszek26.198028745.8915563
421ZAMBELLI METAL SRL1520517916216314.01552 694 04642 016 92034 591 56227 891 89515.0520077, Sepsiszentgy\\xf6rgy, \\xc9p\\xedt\\xf5k u...['badogosmunkak', '', '\"']also-haromszek25.81851545.861235
422ZARAH MODEN SRL27647857788392.02154 604 899148 531 005141 121 629139 752 7122.0525400, K\\xe9zdiv\\xe1s\\xe1rhely, B\\xe9ke u., 27\\t['nadraggyartas', 'textilipar', '', 'INDUSTRIA...felso-haromszek26.135967045.9969390
423ZENCO TRANS SRL403637333341.0NaNNaNNaNNaNNaNNaN535700, Marosh\\xe9v\\xedz, , 2\\t['aruszallitas', '', '\"']gyergyoszek25.353404046.9260300
\n", "

424 rows × 18 columns

\n", "
" ], "text/plain": [ " Cégnév tavaly_left Alkalmazottak száma 2018 \\\n", "0 A M C SRL NaN NaN \n", "1 ABC IMPEX SRL 20 139 \n", "2 ABRAZIV SRL NaN NaN \n", "3 ADILEX FUNGO SRL NaN NaN \n", "4 ADIMAG COM IMPEX SRL NaN NaN \n", ".. ... ... ... \n", "419 WONDERLAND SRL 44 28 \n", "420 ZABOLA ESTATE SRL 39 45 \n", "421 ZAMBELLI METAL SRL 15 205 \n", "422 ZARAH MODEN SRL 2 764 \n", "423 ZENCO TRANS SRL 40 36 \n", "\n", " Alkalmazottak száma 2017 Alkalmazottak száma 2016 \\\n", "0 NaN NaN \n", "1 145 132 \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", ".. ... ... \n", "419 29 28 \n", "420 31 25 \n", "421 179 162 \n", "422 785 778 \n", "423 37 33 \n", "\n", " Alkalmazottak száma 2015 nr_alkalmazottak tavaly_right \\\n", "0 NaN NaN 20 \n", "1 132 21.0 20 \n", "2 NaN NaN 49 \n", "3 NaN NaN >50 \n", "4 NaN NaN 36 \n", ".. ... ... ... \n", "419 29 46.0 NaN \n", "420 23 28.0 NaN \n", "421 163 14.0 15 \n", "422 839 2.0 2 \n", "423 33 41.0 NaN \n", "\n", " Árbevétel 2018 (RON) Árbevétel 2017 (RON) Árbevétel 2016 (RON) \\\n", "0 28 716 061 27 881 270 24 390 005 \n", "1 37 875 372 36 539 530 33 071 733 \n", "2 16 316 705 5 483 275 0 \n", "3 6 834 996 925 866 - \n", "4 65 548 663 57 106 224 48 827 471 \n", ".. ... ... ... \n", "419 NaN NaN NaN \n", "420 NaN NaN NaN \n", "421 52 694 046 42 016 920 34 591 562 \n", "422 154 604 899 148 531 005 141 121 629 \n", "423 NaN NaN NaN \n", "\n", " Árbevétel 2015 (RON) nr_arbevetel \\\n", "0 22 173 039 22.0 \n", "1 31 956 498 24.0 \n", "2 0 17.0 \n", "3 - 50.0 \n", "4 47 437 389 40.0 \n", ".. ... ... \n", "419 NaN NaN \n", "420 NaN NaN \n", "421 27 891 895 15.0 \n", "422 139 752 712 2.0 \n", "423 NaN NaN \n", "\n", " Cím \\\n", "0 527100, H\\xeddv\\xe9g, Rom\\xe1n u., 203\\t \n", "1 535600, Sz\\xe9kelyudvarhely, R\\xe1k\\xf3czi Fer... \n", "2 535500, Gyergy\\xf3szentmikl\\xf3s, \\xc1llom\\xe1... \n", "3 537355, Vasl\\xe1b, , 37\\t \n", "4 540190, Marosv\\xe1s\\xe1rhely, Szabads\\xe1g u.,... \n", ".. ... 
\n", "419 527160, Torja, F\\xf5 u., 225\\t \n", "420 527190, Zabola, , 437\\t \n", "421 520077, Sepsiszentgy\\xf6rgy, \\xc9p\\xedt\\xf5k u... \n", "422 525400, K\\xe9zdiv\\xe1s\\xe1rhely, B\\xe9ke u., 27\\t \n", "423 535700, Marosh\\xe9v\\xedz, , 2\\t \n", "\n", " Kw Régió \\\n", "0 ['nagykereskedelem', '', '\"'] also-haromszek \n", "1 ['csomagoloanyagok', 'nyomdak', '', '\"'] udvarhelyszek \n", "2 ['szerszamgepek', '', '\"'] gyergyoszek \n", "3 ['zoldseg-gyumolcs-csomagolas', '', '\"'] gyergyoszek \n", "4 ['belsoepiteszeti-anyagok', 'epitoanyagok', 'f... marosszek \n", ".. ... ... \n", "419 ['epitkezes', '', '\"'] felso-haromszek \n", "420 ['hotelek', 'kastelyszallo', 'vendeglatas', ''... felso-haromszek \n", "421 ['badogosmunkak', '', '\"'] also-haromszek \n", "422 ['nadraggyartas', 'textilipar', '', 'INDUSTRIA... felso-haromszek \n", "423 ['aruszallitas', '', '\"'] gyergyoszek \n", "\n", " Latitude Longitude \n", "0 25.5892160 45.8391350 \n", "1 25.290034 46.289768 \n", "2 25.5751650 46.7174250 \n", "3 25.62371 46.64607 \n", "4 24.548819 46.537905 \n", ".. ... ... 
\n", "419 26.063746 46.040239 \n", "420 26.1980287 45.8915563 \n", "421 25.818515 45.861235 \n", "422 26.1359670 45.9969390 \n", "423 25.3534040 46.9260300 \n", "\n", "[424 rows x 18 columns]" ] }, "execution_count": 242, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 243, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:22:05.563869Z", "start_time": "2018-12-16T11:22:05.557864Z" } }, "outputs": [], "source": [ "# Rows whose 'Longitude' string is empty are set aside in `manual`;\n", "# all other rows stay in `data`.\n", "manual=data[data['Longitude']=='']\n", "data=data[data['Longitude']!='']" ] }, { "cell_type": "code", "execution_count": 222, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:40:49.592428Z", "start_time": "2018-12-16T11:40:49.009253Z" } }, "outputs": [], "source": [ "import os\n", "\n", "data.to_excel('data.xlsx')\n", "manual.to_excel('manual.xlsx')\n", "# manual_manual.xlsx is corrected by hand and read back by the next code cell.\n", "# Only seed it when it is missing, so that re-running the notebook cannot\n", "# overwrite the hand-made corrections with the uncorrected rows.\n", "if not os.path.exists('manual_manual.xlsx'):\n", "    manual.to_excel('manual_manual.xlsx')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fix sector, address data and coordinates manually: edit `manual_manual.xlsx` by hand before running the cells below, which read it back and append it to `data`." ] }, { "cell_type": "code", "execution_count": 244, "metadata": {}, "outputs": [], "source": [ "manual=pd.read_excel('manual_manual.xlsx')" ] }, { "cell_type": "code", "execution_count": 245, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned.
A future version\n", "of pandas will change to not sort by default.\n", "\n", "To accept the future behavior, pass 'sort=False'.\n", "\n", "To retain the current behavior and silence the warning, pass 'sort=True'.\n", "\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "# `data` and `manual` do not have identical columns (see the FutureWarning in\n", "# the recorded output). sort=True keeps the column sorting this notebook was\n", "# written with instead of depending on the pandas version's default.\n", "data=pd.concat([data,manual],sort=True).set_index('Cégnév')" ] }, { "cell_type": "code", "execution_count": 246, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:34:37.319384Z", "start_time": "2018-12-16T11:34:37.281390Z" } }, "outputs": [], "source": [ "sectors=pd.read_excel('sectors.xlsx')\n", "sectormap={}\n", "repl={'á':'a','é':'e','í':'i','ó':'o','ú':'u','ü':'u','ű':'u','ő':'o','ö':'o'}\n", "for s in sectors.columns:\n", " for k in sectors[s].values:\n", " for j in str(k).replace(' ','').split(','):\n", " sectormap[j]=s\n", " for c in repl:\n", " j=j.replace(c,repl[c])\n", " sectormap[j]=s" ] }, { "cell_type": "code", "execution_count": 247, "metadata": { "ExecuteTime": { "end_time": "2018-12-16T11:37:53.247325Z", "start_time": "2018-12-16T11:37:53.070331Z" } }, "outputs": [], "source": [ "valid=[]\n", "kws=[]\n", "ki=-1\n", "for i in range(len(data.index)):\n", " ks=data.loc[data.index[i]]['Kw'].replace(\"'\",'').replace('[','')\\\n", " .replace(']','').replace(' ','').lower().split(',')[:-2]\n", " for k in ks:\n", " if k in sectormap:\n", " kws.append(sectormap[k])\n", " break\n", " k=k.replace('-','')\n", " if k in sectormap:\n", " kws.append(sectormap[k])\n", " break\n", " if len(kws)