# Housing Prices in São Paulo

This notebook gathers information about housing prices and sizes in the city of São Paulo, Brazil.

In [1]:
# Import Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib notebook

The data is gathered from [Imovel Web](http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-1.html), a Brazilian online real estate portal.

In [2]:
def getURL(page_number):
    """Build the address of one results page of the São Paulo sale listings.

    ``page_number`` is converted with ``str()`` and placed between the fixed
    URL prefix and the ``.html`` suffix.
    """
    pieces = (
        "http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-",
        str(page_number),
        ".html",
    )
    return "".join(pieces)

In [3]:
def num(s):
    """Parse ``s`` as a number, preferring ``int`` and falling back to ``float``.

    A ``ValueError`` from the ``float`` fallback is not caught, so input that
    is neither an integer nor a float literal raises ``ValueError``.
    """
    try:
        parsed = int(s)
    except ValueError:
        # Not an integer literal; let float() either succeed or raise.
        parsed = float(s)
    return parsed

In [None]:
def grab_data(url, i):
    """Scrape one listing page and append ``[size, price]`` rows to the global ``df``.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    i : int
        Row label to use for the next row written to ``df``.

    Returns
    -------
    int
        The next free row label, i.e. ``i`` advanced by the number of rows
        stored. If the page cannot be downloaded, ``i`` is returned unchanged.

    Notes
    -----
    * Listings without a ``post-m2totales`` element are ignored.
    * The price text has ``R$`` and ``.`` removed and is divided by 1000
      before being stored; the size text has ``total`` and ``m²`` removed.
    * A listing whose price or size cannot be parsed is skipped on its own,
      so the remaining listings on the page are still collected.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        result = requests.get(url, timeout=30)
        result.raise_for_status()
    except requests.RequestException as err:
        # Best effort: report the failed page and let the caller carry on.
        print("--> ERROR", err)
        return i

    page = BeautifulSoup(result.content, "html5lib")
    skipped = 0
    for item in page.find_all('li', class_='post'):
        size_tag = item.find("li", class_='post-m2totales')
        if size_tag is None:
            continue
        try:
            price_text = item.find("span", class_='precio-valor').string
            price = num(price_text.replace("R$", "").replace(".", "").strip()) / 1000
            # NOTE(review): unlike the price, "." is not stripped from the size,
            # so a size written with a thousands separator would be read as a
            # decimal - confirm how the site formats large areas.
            size = num(size_tag.text.replace("total", "").replace("m²", "").strip())
        except (AttributeError, ValueError):
            # AttributeError: price element missing or without a single text node.
            # ValueError: text that is not a plain number.
            skipped += 1
            continue
        df.loc[i] = [size, price]
        i = i + 1

    if skipped:
        print("--> skipped " + str(skipped) + " unparseable listing(s)")
    return i

In [None]:
# Start from an empty table on every run; grab_data() writes its rows into the
# global ``df`` and hands back the next free row label.
df = pd.DataFrame([], columns=('size', 'price'))
i = 0
for page_number in range(1, 100):
    url = getURL(page_number)
    print("{} - {}".format(page_number, url))
    i = grab_data(url, i)
df.tail()

1 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-1.html
2 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-2.html
3 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-3.html
4 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-4.html
5 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-5.html
6 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-6.html
7 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-7.html
8 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-8.html
9 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-9.html
10 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-10.html
11 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-11.html
12 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-12.html
13 - http://www.imovelweb.com.br/imoveis-venda-sao-paulo-sp-pagina-13.html
14 - http://www.imovelweb.com.br/imoveis-ve

In [None]:
# Prices in ``df`` are stored in thousands of R$ (grab_data divides the scraped
# value by 1000), so the y-axis limit and label use that same unit.
ax = df.plot(x="size", y="price", kind='scatter', color='DarkBlue', xlim=(0, 350), ylim=(0, 2500))
ax.set_xlabel("Size (m²)")
ax.set_ylabel("Price (thousand R$)")