# leboncoin/leboncoin.py

# -*- coding: utf-8 -*-
__author__ = "yelfathi"


import argparse
import re
import requests
from bs4 import BeautifulSoup
from multiprocessing import Process
from email.mime.text import MIMEText


class Leboncoin(Process):
    """Worker process that watches one Leboncoin search page for new ads.

    When run, the process downloads the search page at ``URL``, collects the
    ads listed on it, fetches the detail page of every ad whose id is not yet
    stored in ``<keyw>-db.txt`` and sends the collected descriptions through
    the Free Mobile SMS endpoint.  Ids are appended to the archive file only
    after the notification request has returned, so a failure part-way
    through never marks an ad as seen without it having been reported.
    """

    URL = 'https://www.leboncoin.fr/locations/offres/aquitaine/?th=1&location=Toutes%20les%20communes%2040180&parrot=0'

    # NOTE(security): these credentials are hard-coded in source control.
    # They should be moved to configuration / the environment and rotated.
    SMS_API_URL = 'https://smsapi.free-mobile.fr/sendmsg'
    SMS_USER = '11117653'
    SMS_PASS = 'poUbj5QQgU9Iwm'

    # Image used when an ad page exposes no picture.
    DEFAULT_IMAGE = 'http://static.leboncoin.fr/img/logo_big_new.png'
    # Timeout, in seconds, applied to every HTTP request.
    TIMEOUT = 5

    _AD_ID_RE = re.compile(r'([0-9]+)\.htm')

    def __init__(self):
        Process.__init__(self)
        # Prefix of the archive file name.
        self.keyw = "leboncoin"
        self.url = self.URL

    def run(self):
        """Check the search page and notify about ads not seen before.

        Raises:
            LookupError: if the search page or an ad page cannot be reached.
        """
        listing = self._fetch(self.url)
        ads = self._collect_ads(listing.content)

        archive_path = self.keyw + '-db.txt'
        seen = self._read_archive(archive_path)
        new_ads = [(ad_id, ad_url) for ad_id, ad_url in ads
                   if ad_id not in seen]
        if not new_ads:
            return

        message = ''.join(self._describe_ad(ad_url) for _, ad_url in new_ads)
        self._notify('Leboncoin:' + message)

        # Record the ids only now: if a fetch or the notification failed
        # above, the ads stay unseen and are retried on the next run.
        with open(archive_path, 'a') as archive:
            archive.write(''.join(ad_id + '\n' for ad_id, _ in new_ads))

    def _fetch(self, url):
        """Return the HTTP response for *url* or raise ``LookupError``."""
        try:
            return requests.get(url, timeout=self.TIMEOUT)
        except (requests.ConnectionError, requests.Timeout):
            # A timeout is passed to the request, so it has to be handled
            # alongside plain connection failures.
            raise LookupError('Could not reach host')

    @classmethod
    def _ad_id_from_href(cls, href):
        """Return the numeric ad id embedded in *href*, or ``None``."""
        match = cls._AD_ID_RE.search(href or '')
        return str(match.group(1)) if match else None

    def _collect_ads(self, html):
        """Return ``(ad_id, ad_url)`` pairs for the ads found in *html*.

        Only anchors that have a ``title`` attribute and contain a ``date``
        div are considered.  Each id is reported once, in page order.
        """
        soup = BeautifulSoup(html, 'html.parser')
        ads = []
        known = set()
        for anchor in soup.find_all('a', {'title': True}):
            if not anchor.find_all('div', {'class': 'date'}):
                continue
            href = anchor.get('href', '')
            ad_id = self._ad_id_from_href(href)
            if ad_id is None or ad_id in known:
                continue
            known.add(ad_id)
            ads.append((ad_id, href))
        return ads

    @staticmethod
    def _read_archive(path):
        """Return the set of ad ids stored in *path*, creating it if needed."""
        with open(path, 'a+') as archive:
            # In append mode the stream may start at end-of-file; rewind so
            # the existing ids are actually read.
            archive.seek(0)
            return set(archive.read().splitlines())

    def _describe_ad(self, ad_url):
        """Fetch the ad page at *ad_url* and return its HTML summary."""
        page = self._fetch(ad_url)
        soup = BeautifulSoup(page.content, 'html.parser')

        price_tag = soup.find('span', {'class': 'price'})
        if price_tag is not None:
            price = price_tag.text.strip()
        else:
            price = 'Not specified'

        image = None
        gallery = soup.find('div', {'class': 'print-lbcImages'})
        if gallery is not None:
            first_img = gallery.find('img')
            if first_img is not None:
                image = first_img.get('src')
        if not image:
            image = self.DEFAULT_IMAGE

        body = soup.find('div', 'content')
        if body is None:
            description = 'Not specified'
        else:
            for line_break in body.find_all('br'):
                line_break.extract()
            description = body.text.strip()

        return ('<html><head></head><body><p><a href="'
                + ad_url
                + '">Link to LBC ad</a></p><p>Description: '
                + description + '</p>'
                + '<p>Price: ' + price
                + '</p><p><img alt="Main Image" src="'
                + image
                + '"/></p><hr noshade width="50%" align="center">'
                + '</body></html>')

    def _notify(self, text):
        """Send *text* through the SMS endpoint."""
        # Passing the values through ``params`` lets requests URL-encode
        # them; the message contains characters such as '&', '#' and '"'
        # that would otherwise corrupt the query string.
        requests.get(self.SMS_API_URL,
                     params={'user': self.SMS_USER,
                             'pass': self.SMS_PASS,
                             'msg': text},
                     timeout=self.TIMEOUT)


if __name__ == '__main__':
    # Script entry point: parse the command line, then launch one watcher
    # process.  The parsed ``--objects`` value is not consumed by anything
    # in this block.
    parser = argparse.ArgumentParser(
        # The description text is prefixed with 24 spaces of padding.
        description=' ' * 24 + 'Check on Leboncoin.fr for new ads.',
    )
    parser.add_argument(
        '--objects',
        required=False,
        help='objects to look after seperated by "+"',
    )
    args = parser.parse_args()

    watcher = Leboncoin()
    watcher.start()