Web Scraping Real Estate Data
TO DO:
1) Make proper markdowns in the notebooks
2) Improve variable names and code structure
3) Check whether data is accurate using regex or do it manually or visually
4) TRY stuff from the above links (see all)
5) Follow a ML pipeline for prediction -- make a model
6) Make a verdict
7) Write a report
8) Deploy the model (maybe using flask or django or some other way)
Sources Used and Unused:
The links lead to tutorials on web scraping (different methods) or on searching text with regular expressions
https://www.realtor.com/realestateandhomes-search/Palo-Alto_CA
https://www.youtube.com/watch?v=RvCBzhhydNk
https://youtu.be/iESyyogOkY0 https://www.geeksforgeeks.org/python-extract-words-from-given-string/ https://medium.com/quantrium-tech/extracting-words-from-a-string-in-python-using-regex-dac4b385c1b8 https://www.guru99.com/python-regular-expressions-complete-tutorial.html
from bs4 import BeautifulSoup # web scrapping package
from urllib.request import Request, urlopen # since we will send a request to a web page and we will get a response back (some html)
import requests
from csv import writer
# Download the realtor.com search results page for Palo Alto and pull the raw
# price / address / "space" text out of every listing card.
url = "https://www.realtor.com/realestateandhomes-search/Palo-Alto_CA"

# Keep the request object under its own name: binding it to ``requests`` would
# shadow the ``requests`` module imported at the top of the file.
# A browser-like User-Agent header is sent along with the request.
request = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

# Use the response as a context manager so the connection is closed as soon
# as the body has been read.
with urlopen(request) as response:
    webpage = response.read()

soup = BeautifulSoup(webpage, "html.parser")

# BeautifulSoup spells the keyword argument ``class_`` because ``class`` is a
# reserved word in Python.
# NOTE(review): the class strings below are copied verbatim from the page
# markup; confirm they still match the live page before trusting the results.
find_price_content = soup.find_all("span", class_="rui__x3geed-0 kitA-dS")
find_address_content = soup.find_all("div", class_="jsx-1489967104 address ellipsis srp-page-address srp-address-redesign")
# Each <ul> holds the beds / baths / area details of one listing; presumably
# its text comes back as a single string with no spaces between the items.
find_space_content = soup.find_all("ul", class_="jsx-946479843 property-meta list-unstyled property-meta-srpPage")

# Keep only the visible text of every matched tag.
price = [tag.text for tag in find_price_content]
address = [tag.text for tag in find_address_content]
space = [tag.text for tag in find_space_content]
print(space)
def regex_check_for_space(l, regex_string, original_string):
    """
    Append one extracted field to ``l`` and return ``l``.

    ``regex_string`` is the result of a regex search: when it is ``None``
    (nothing matched) the whole ``original_string`` is stored instead,
    otherwise the matched text (``regex_string.group()``) is stored.
    The list is modified in place; the same list object is returned.
    """
    value = original_string if regex_string is None else regex_string.group()
    l.append(value)
    return l
# Separate the combined "space" text of each listing into individual fields.
# Regular Expression package
import re

# Patterns are raw strings so that ``\s`` reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape, and they are compiled
# once here rather than on every loop iteration.
BED_PATTERN = re.compile(r"[0-9]+bed")
BATH_PATTERN = re.compile(r"[0-9]+bath")
AREA_LOT_PATTERN = re.compile(r"[0-9]+sqft\slot|[0-9]+,+[0-9]+sqft\slot")
AREA_PATTERN = re.compile(r"[0-9]+sqft|[0-9]+,+[0-9]+sqft")

beds = []
bath = []
area = []
area_lot = []
for listing in space:
    # When a pattern does not match, regex_check_for_space stores the whole
    # listing text in that column instead of the matched fragment.
    regex_check_for_space(beds, BED_PATTERN.search(listing), listing)
    # NOTE(review): only digits directly before "bath" are captured, so a
    # value such as "2.5bath" would be stored as "5bath".
    regex_check_for_space(bath, BATH_PATTERN.search(listing), listing)
    # The lot area is distinguished by the trailing " lot" after "sqft".
    regex_check_for_space(area_lot, AREA_LOT_PATTERN.search(listing), listing)
    # NOTE(review): this pattern also matches the numeric part of
    # "<n>sqft lot"; it returns the first "sqft" figure in the text, so a
    # listing that only reports a lot size would put that size in ``area``.
    regex_check_for_space(area, AREA_PATTERN.search(listing), listing)

# Show the combined rows; zip stops at the shortest of the seven lists.
for row in zip(price, address, space, beds, bath, area, area_lot):
    print(row)
import pandas as pd

# Assemble one tuple per listing (zip stops at the shortest input list) and
# load the rows into a DataFrame with default integer column labels.
rows = list(zip(price, address, space, beds, bath, area, area_lot))
df = pd.DataFrame(rows)

# Quick look at the first few rows.
print(df.head())

# Persist the table to disk as tab-separated text.
# NOTE(review): the file carries a .csv extension although the separator is
# a tab; readers must pass the same separator when loading it.
df.to_csv("Palo Alto houses form realtor.csv", sep="\t")