Goal of the notebook:

To scrape data on houses listed for sale on Realtor.com.

Future aim is to make predictions from the data

TO DO:

1) Write proper markdown cells in the notebook

2) Improve variable names and code structure

3) Check whether the data is accurate, either with regex or by manual/visual inspection

4) TRY stuff from the above links (see all)

5) Follow a ML pipeline for prediction -- make a model

6) Make a verdict

7) Write a report

8) Deploy the model (maybe using flask or django or some other way)

from bs4 import BeautifulSoup # web scrapping package
from urllib.request import Request, urlopen # since we will send a request to a web page and we will get a response back (some html)
import requests
from csv import writer 
url = "https://www.realtor.com/realestateandhomes-search/Palo-Alto_CA"

# Build the request with an explicit User-Agent header.
# The object is deliberately called `request` (singular): naming it
# `requests` would overwrite the `requests` module imported above.
request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# Download the raw HTML. The context manager closes the HTTP response even
# when reading raises.
with urlopen(request) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

# The keyword argument is "class_" instead of "class" because "class" is a
# reserved word in Python.
# NOTE(review): these class names look auto-generated by the site and will
# probably change over time -- confirm they still match before trusting the
# results.
find_price_content = soup.find_all("span", class_="rui__x3geed-0 kitA-dS")
find_address_content = soup.find_all("div", class_="jsx-1489967104 address ellipsis srp-page-address srp-address-redesign")
# The text of each <ul> comes back with its items joined without any
# separator, e.g. '2bed1bath660sqft2,325sqft lot'.
find_space_content = soup.find_all("ul", class_="jsx-946479843 property-meta list-unstyled property-meta-srpPage")
# Pull the text out of every matched tag, keeping the order in which the
# tags were found.
price = [tag.text for tag in find_price_content]
address = [tag.text for tag in find_address_content]
space = [tag.text for tag in find_space_content]

print(space)
['2bed1bath660sqft2,325sqft lot', '2bed1bath865sqft875sqft lot', '2bed2.5bath1,230sqft630sqft lot', '2bed2bath1,440sqft', '3bed1bath1,004sqft7,748sqft lot', '2bed3bath1,490sqft961sqft lot', '2bed1bath943sqft', '5bed5bath4,003sqft9,523sqft lot', '4bed2bath1,554sqft6,292sqft lot', '2bed2.5bath1,295sqft630sqft lot', '4bed2bath2,066sqft5,849sqft lot', '1bed1bath885sqft', '3bed2bath1,710sqft9,400sqft lot', '2bed2.5bath1,468sqft', '2bed1bath1,091sqft', '4bed3.5+bath4,540sqft1.02acre lot', '5bed4.5+bath5,042sqft8,755sqft lot', '3bed2bath1,583sqft6,500sqft lot', '1bed1bath876sqft', '2bed1bath946sqft', '2bed2bath773sqft2,000sqft lot', '6bed5bath3,456sqft7,670sqft lot', '5bed4bath2,871sqft5,000sqft lot', '5bed4bath3,072sqft7,605sqft lot', '6bed5.5bath3,853sqft6,382sqft lot', '4bed2bath1,639sqft7,084sqft lot', '4bed2bath1,664sqft6,504sqft lot', '3bed2.5bath1,494sqft', '4bed2.5bath2,697sqft8,508sqft lot', '4bed3bath2,410sqft6,380sqft lot', '2bed2bath1,015sqft', '2bed2.5bath1,968sqft0.24acre lot', '3bed2bath1,686sqft7,620sqft lot', '5bed5bath3,713sqft7,500sqft lot', '4bed3.5bath3,000sqft8,094sqft lot', '4bed3bath2,383sqft8,910sqft lot', '4bed2.5bath2,046sqft7,386sqft lot', '3bed2bath1,479sqft8,801sqft lot', '8bed3,120sqft6,749sqft lot', '10bed4,266sqft8,919sqft lot', '5bed4bath3,085sqft6,050sqft lot', '4bed4bath2,618sqft6,000sqft lot']
def regex_check_for_space(l, regex_string, original_string):
    """
    Append one value to ``l`` and return ``l``.

    ``regex_string`` is the result of a regex search: either a match object
    or ``None``. On a match, the matched text (``.group()``) is appended;
    otherwise ``original_string`` is appended unchanged as the fallback.

    The helper exists so the "match or fallback" logic is written only once.
    """
    value = original_string if regex_string is None else regex_string.group()
    l.append(value)
    return l
# Separate the combined listing text into individual fields

# Regular Expression package
import re

# Patterns are compiled once, outside the loop, and written as raw strings so
# that "\s" and "\." reach the regex engine instead of being read as Python
# string escapes.
bed_pattern = re.compile(r"[0-9]+bed")
# Bath counts may be fractional and may carry a trailing "+",
# e.g. "2.5bath" or "3.5+bath".
bath_pattern = re.compile(r"[0-9]+(?:\.[0-9]+)?\+?bath")
# Lot sizes are given in sqft or in acres,
# e.g. "2,325sqft lot" or "1.02acre lot".
area_lot_pattern = re.compile(r"[0-9][0-9,]*(?:\.[0-9]+)?(?:sqft|acre)\slot")
# Living area is a sqft figure that is NOT directly followed by " lot";
# the lookahead keeps a lot size from being reported as the living area
# when a listing has no living-area figure.
area_pattern = re.compile(r"[0-9][0-9,]*sqft(?!\slot)")

beds = []
bath = []
area = []
area_lot = []
for listing in space:
    # regex_check_for_space appends its third argument when there is no
    # match. Passing None records a missing field as None instead of copying
    # the whole listing text into the column.
    regex_check_for_space(beds, bed_pattern.search(listing), None)
    regex_check_for_space(bath, bath_pattern.search(listing), None)
    regex_check_for_space(area_lot, area_lot_pattern.search(listing), None)
    regex_check_for_space(area, area_pattern.search(listing), None)
# Print one tuple per listing: raw scraped values first, parsed fields after.
# zip() stops at the shortest list, so rows are only produced while every
# list still has an entry.
columns = (price, address, space, beds, bath, area, area_lot)
for row in zip(*columns):
    print(row)
('$1,599,000', '736 Homer Ave, Palo Alto, CA 94301', '2bed1bath660sqft2,325sqft lot', '2bed', '1bath', '660sqft', '2,325sqft lot')
('$1,098,000', '280 Waverley St, Palo Alto, CA 94301', '2bed1bath865sqft875sqft lot', '2bed', '1bath', '865sqft', '875sqft lot')
('$1,380,000', '2585 Park Blvd Apt Z206, Palo Alto, CA 94306', '2bed2.5bath1,230sqft630sqft lot', '2bed', '5bath', '1,230sqft', '630sqft lot')
('$1,790,000', '101 Alma St Apt 805, Palo Alto, CA 94301', '2bed2bath1,440sqft', '2bed', '2bath', '1,440sqft', '2bed2bath1,440sqft')
('$1,795,000', '3109 Maddux Dr, Palo Alto, CA 94303', '3bed1bath1,004sqft7,748sqft lot', '3bed', '1bath', '1,004sqft', '7,748sqft lot')
('$1,850,000', '685 High St Apt 5F, Palo Alto, CA 94301', '2bed3bath1,490sqft961sqft lot', '2bed', '3bath', '1,490sqft', '961sqft lot')
('$949,000', '4250 El Camino Real Apt A307, Palo Alto, CA 94306', '2bed1bath943sqft', '2bed', '1bath', '943sqft', '2bed1bath943sqft')
('$9,500,000', '2111 Barbara Dr, Palo Alto, CA 94303', '5bed5bath4,003sqft9,523sqft lot', '5bed', '5bath', '4,003sqft', '9,523sqft lot')
('$3,100,000', '274 Tennessee Ln, Palo Alto, CA 94306', '4bed2bath1,554sqft6,292sqft lot', '4bed', '2bath', '1,554sqft', '6,292sqft lot')
('$1,698,000', '1345 Alma St, Palo Alto, CA 94301', '2bed2.5bath1,295sqft630sqft lot', '2bed', '5bath', '1,295sqft', '630sqft lot')
('$3,999,888', '485 Marion Ave, Palo Alto, CA 94301', '4bed2bath2,066sqft5,849sqft lot', '4bed', '2bath', '2,066sqft', '5,849sqft lot')
('$875,000', '4250 El Camino Real Apt A102, Palo Alto, CA 94306', '1bed1bath885sqft', '1bed', '1bath', '885sqft', '1bed1bath885sqft')
('$3,100,000', '4233 Los Palos Ave, Palo Alto, CA 94306', '3bed2bath1,710sqft9,400sqft lot', '3bed', '2bath', '1,710sqft', '9,400sqft lot')
('$1,850,000', '685 High St Apt 2C, Palo Alto, CA 94301', '2bed2.5bath1,468sqft', '2bed', '5bath', '1,468sqft', '2bed2.5bath1,468sqft')
('$1,399,000', '518 Everett Ave Apt A, Palo Alto, CA 94301', '2bed1bath1,091sqft', '2bed', '1bath', '1,091sqft', '2bed1bath1,091sqft')
('$10,495,000', '27 Crescent Dr, Palo Alto, CA 94301', '4bed3.5+bath4,540sqft1.02acre lot', '4bed', '4bed3.5+bath4,540sqft1.02acre lot', '4,540sqft', '4bed3.5+bath4,540sqft1.02acre lot')
('$9,280,000', '2001 Webster St, Palo Alto, CA 94301', '5bed4.5+bath5,042sqft8,755sqft lot', '5bed', '5bed4.5+bath5,042sqft8,755sqft lot', '5,042sqft', '8,755sqft lot')
('$2,995,000', '3894 Corina Way, Palo Alto, CA 94303', '3bed2bath1,583sqft6,500sqft lot', '3bed', '2bath', '1,583sqft', '6,500sqft lot')
('$1,099,000', '3073 Middlefield Rd Apt 101, Palo Alto, CA 94306', '1bed1bath876sqft', '1bed', '1bath', '876sqft', '1bed1bath876sqft')
('$998,000', '777 San Antonio Rd Apt 4, Palo Alto, CA 94303', '2bed1bath946sqft', '2bed', '1bath', '946sqft', '2bed1bath946sqft')
('$2,395,000', '678 Hawthorne Ave, Palo Alto, CA 94301', '2bed2bath773sqft2,000sqft lot', '2bed', '2bath', '773sqft', '2,000sqft lot')
('$5,968,000', '919 Amarillo Ave, Palo Alto, CA 94303', '6bed5bath3,456sqft7,670sqft lot', '6bed', '5bath', '3,456sqft', '7,670sqft lot')
('$2,998,000', '3661 Park Blvd, Palo Alto, CA 94306', '5bed4bath2,871sqft5,000sqft lot', '5bed', '4bath', '2,871sqft', '5,000sqft lot')
('$4,488,000', '10 Crescent Dr, Palo Alto, CA 94301', '5bed4bath3,072sqft7,605sqft lot', '5bed', '4bath', '3,072sqft', '7,605sqft lot')
('$4,988,888', '3424 Cowper Ct, Palo Alto, CA 94306', '6bed5.5bath3,853sqft6,382sqft lot', '6bed', '5bath', '3,853sqft', '6,382sqft lot')
('$2,500,000', '971 Oregon Ave, Palo Alto, CA 94303', '4bed2bath1,639sqft7,084sqft lot', '4bed', '2bath', '1,639sqft', '7,084sqft lot')
('$2,998,000', '47 Roosevelt Cir, Palo Alto, CA 94306', '4bed2bath1,664sqft6,504sqft lot', '4bed', '2bath', '1,664sqft', '6,504sqft lot')
('$1,688,000', '767 Loma Verde Ave Unit B, Palo Alto, CA 94303', '3bed2.5bath1,494sqft', '3bed', '5bath', '1,494sqft', '3bed2.5bath1,494sqft')
('$6,500,000', '606 Santa Rita Ave, Palo Alto, CA 94301', '4bed2.5bath2,697sqft8,508sqft lot', '4bed', '5bath', '2,697sqft', '8,508sqft lot')
('$3,998,000', '872 Boyce Ave, Palo Alto, CA 94301', '4bed3bath2,410sqft6,380sqft lot', '4bed', '3bath', '2,410sqft', '6,380sqft lot')
('$1,598,000', '436 High St Apt 105, Palo Alto, CA 94301', '2bed2bath1,015sqft', '2bed', '2bath', '1,015sqft', '2bed2bath1,015sqft')
('$4,450,000', '530 Jefferson Dr, Palo Alto, CA 94303', '2bed2.5bath1,968sqft0.24acre lot', '2bed', '5bath', '1,968sqft', '2bed2.5bath1,968sqft0.24acre lot')
('$2,988,000', '3760 El Centro St, Palo Alto, CA 94306', '3bed2bath1,686sqft7,620sqft lot', '3bed', '2bath', '1,686sqft', '7,620sqft lot')
('$4,488,000', '118 Churchill Ave, Palo Alto, CA 94301', '5bed5bath3,713sqft7,500sqft lot', '5bed', '5bath', '3,713sqft', '7,500sqft lot')
('$5,695,000', '776 Rosewood Dr, Palo Alto, CA 94303', '4bed3.5bath3,000sqft8,094sqft lot', '4bed', '5bath', '3,000sqft', '8,094sqft lot')
('$3,488,000', '3349 Saint Michael Ct, Palo Alto, CA 94306', '4bed3bath2,383sqft8,910sqft lot', '4bed', '3bath', '2,383sqft', '8,910sqft lot')
('$4,798,000', '2388 Waverley St, Palo Alto, CA 94301', '4bed2.5bath2,046sqft7,386sqft lot', '4bed', '5bath', '2,046sqft', '7,386sqft lot')
('$2,900,000', '3421 Kenneth Dr, Palo Alto, CA 94303', '3bed2bath1,479sqft8,801sqft lot', '3bed', '2bath', '1,479sqft', '8,801sqft lot')
('$3,200,000', '290 Curtner Ave, Palo Alto, CA 94306', '8bed3,120sqft6,749sqft lot', '8bed', '8bed3,120sqft6,749sqft lot', '3,120sqft', '6,749sqft lot')
('$4,200,000', '4211 McKellar Ln, Palo Alto, CA 94306', '10bed4,266sqft8,919sqft lot', '10bed', '10bed4,266sqft8,919sqft lot', '4,266sqft', '8,919sqft lot')
('$4,280,000', '1031 Embarcadero Rd, Palo Alto, CA 94303', '5bed4bath3,085sqft6,050sqft lot', '5bed', '4bath', '3,085sqft', '6,050sqft lot')
('$4,050,000', '220 Matadero Ave, Palo Alto, CA 94306', '4bed4bath2,618sqft6,000sqft lot', '4bed', '4bath', '2,618sqft', '6,000sqft lot')
import pandas as pd 

# One row per listing; columns keep pandas' default integer labels (0..6) in
# the order: price, address, space, beds, bath, area, area_lot.
rows = list(zip(price, address, space, beds, bath, area, area_lot))
df = pd.DataFrame(rows)

# Quick look at the first few rows.
print(df.head())
            0                                             1  \
0  $1,599,000            736 Homer Ave, Palo Alto, CA 94301   
1  $1,098,000          280 Waverley St, Palo Alto, CA 94301   
2  $1,380,000  2585 Park Blvd Apt Z206, Palo Alto, CA 94306   
3  $1,790,000      101 Alma St Apt 805, Palo Alto, CA 94301   
4  $1,795,000           3109 Maddux Dr, Palo Alto, CA 94303   

                                 2     3      4          5                   6  
0    2bed1bath660sqft2,325sqft lot  2bed  1bath    660sqft       2,325sqft lot  
1      2bed1bath865sqft875sqft lot  2bed  1bath    865sqft         875sqft lot  
2  2bed2.5bath1,230sqft630sqft lot  2bed  5bath  1,230sqft         630sqft lot  
3               2bed2bath1,440sqft  2bed  2bath  1,440sqft  2bed2bath1,440sqft  
4  3bed1bath1,004sqft7,748sqft lot  3bed  1bath  1,004sqft       7,748sqft lot  

Export to CSV

# Write the table to disk. The file carries a .csv extension but is
# tab-delimited (sep='\t'), so readers must be told to split on tabs.
# to_csv also writes the row index unless index=False is passed.
output_path = 'Palo Alto houses form realtor.csv'
df.to_csv(output_path, sep='\t')