![](https://wsofter.ru/wp-content/uploads/2021/07/mobile-logo-selenium-test-automation-software-testing-computer-selenium-png-512_512.jpg)
Today we will look at an example of scraping posts that mention a given keyword from social networks. For this we will use the service www.social-searcher.com.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
#from fake_useragent import UserAgent
import os
import platform
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
from selenium.webdriver.common.proxy import Proxy, ProxyType
# Map each supported OS to its (chromedriver, chromium) bundle location.
# Snapshot builds: http://commondatastorage.googleapis.com/chromium-browser-snapshots/
# (Win_x64/902856, Linux_x64/902856, Mac/902856)
# NOTE(review): the Darwin binaries live under ../bin/linux/ — confirm this
# directory layout is intentional.
_BUNDLES = {
    "Windows": ('../bin/windows/chromedriver_win32/chromedriver.exe',
                '../bin/windows/chrome-win/chrome.exe'),
    "Linux": ('../bin/linux/chromedriver_linux64/chromedriver',
              '../bin/linux/chrome-linux/chrome'),
    "Darwin": ('../bin/linux/chromedriver_mac64/chromedriver',
               '../bin/linux/chrome-mac/chrome'),
}

plt = platform.system()
driver_path = None
browser_path = None
if plt in _BUNDLES:
    driver_path, browser_path = _BUNDLES[plt]
    print(f"Your OS {plt}")
else:
    print(f"Unidentified {plt}")
# Configure a headless Chromium instance suitable for CI/containers.
options = webdriver.ChromeOptions()
#
# Optional user-agent spoofing (left disabled):
#ua = UserAgent()
#userAgent = ua.random
#options.add_argument(f'user-agent={userAgent}')
#
#options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion in containers
options.add_argument('--disable-extensions')
options.add_argument('--test-type')
#options.add_argument('--single-process')
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
# BUG FIX: calling add_experimental_option("excludeSwitches", ...) twice
# overwrites the first value (the option is stored under a single key),
# so "enable-logging" was silently dropped. Pass both switches at once.
options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
options.binary_location = browser_path
browser = webdriver.Chrome(executable_path = driver_path, options=options)
# CDP user-agent overrides (left disabled):
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.021.D1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 EdgA/42.0.0.2057", "platform":"Windows"})
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", "platform":"Windows"})
browser.set_window_size(1920, 1080)
#browser.set_window_size(1024, 1024)
browser.maximize_window()
from urllib.parse import quote

# Target service, search methods and the social networks to query.
baseurls = ["https://www.social-searcher.com"]
methods = ['social-buzz', 'google-social-search']
sites = ['fb', 'tw', 'in']
keyword = "העבודה"
page_to = 10  # number of result pages to crawl per network
# BUG FIX: percent-encode the keyword — raw non-ASCII text is not valid
# in a URI and may be mangled before it reaches the service.
params = '?q=' + quote(keyword) + ''.join(['&' + str(p) + '=on' for p in sites])
url = f'{baseurls[0]}/{methods[1]}/' + params
print(url)
browser.get(url)
time.sleep(2)
browser.save_screenshot("./screenshot.png")
posts = []  # accumulates one dict per parsed post


def parse_list(browser, social_name, page_num=1):
    """Scrape the Google CSE result cards currently rendered in *browser*.

    Appends one dict per card to the module-level ``posts`` list.

    Args:
        browser: Selenium WebDriver already switched into the results iframe.
        social_name: label of the social network being parsed (logging only).
        page_num: 1-based results-page number (logging only).
    """
    print(f"Parsing {social_name} page #{page_num}")
    html = browser.page_source
    bs = BeautifulSoup(html, 'lxml')
    cards_html = bs.find('div', {'class': 'gsc-expansionArea'})
    cards_array = cards_html.find_all('div', {'class': 'gsc-webResult'})
    for card in cards_array:
        title = card.find('a', {'class': 'gs-title'}).text
        text = card.find('div', {'class': 'gs-snippet'}).text
        url = card.find('div', {'class': 'gs-visibleUrl-long'}).text
        pic = None
        try:
            pic = card.find('img', {'class': 'gs-image'})['src']
        except (TypeError, KeyError):
            # card has no image (find() returned None) or no src attribute
            pass
        # BUG FIX: the original used the *variables* as dict keys
        # ({title: title, ...}), so the keys were the post's own text values
        # (and a None key whenever pic was None). Use string keys instead.
        post = {'title': title, 'text': text, 'url': url, 'pic': pic}
        posts.append(post)
# Let the embedded search widgets render before scraping.
time.sleep(3)
browser.save_screenshot("./screenshot.png")
# Each social network's results live in its own column holding an iframe.
# (BUG FIX: removed the unused `iframes` variable that duplicated this query.)
for social in browser.find_elements_by_class_name('iframe-column'):
    social_name = social.find_element_by_class_name('network-name').text
    social_iframe = social.find_element_by_tag_name('iframe')
    browser.switch_to.frame(social_iframe)
    # The first results page is already rendered.
    parse_list(browser, social_name, page_num=1)
    # Click through the remaining pager links inside the iframe.
    for page_num in range(2, page_to + 1):
        page_link = browser.find_element_by_xpath(f"//div[@aria-label='Page {page_num}']")
        browser.execute_script("arguments[0].click();", page_link)
        parse_list(browser, social_name, page_num)
    # Return to the top-level document before the next column.
    browser.switch_to.default_content()
print(posts)
browser.quit()
The service offers paid and shareware pricing plans. We need a free one. The relevant links are:
- https://www.social-searcher.com/google-social-search — free
- https://www.social-searcher.com/social-buzz — shareware with a limit of 100 requests
![](https://wsofter.com/wp-content/uploads/2021/07/screen_social-1024x503.png)
What does the script do? The script opens the page from the link, inserts a keyword, ticks the checkboxes for the desired social networks, and parses the results page by page.
![](https://wsofter.com/wp-content/uploads/2021/07/parsing_result_in_cmd.png)
Below is an advanced version of the parsing code that automatically iterates over a list of keywords.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
#from fake_useragent import UserAgent
import datetime
import os
import platform
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
from selenium.webdriver.common.proxy import Proxy, ProxyType
# Pick the Chromium build and matching chromedriver for the host OS.
plt = platform.system()
driver_path = None
browser_path = None
if plt == "Windows":
    # You must download here http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Win_x64/902856/
    driver_path, browser_path = (
        '../bin/windows/chromedriver_win32/chromedriver.exe',
        '../bin/windows/chrome-win/chrome.exe',
    )
    print(f"Your OS {plt}")
elif plt == "Linux":
    # You must download here http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Linux_x64/902856/
    driver_path, browser_path = (
        '../bin/linux/chromedriver_linux64/chromedriver',
        '../bin/linux/chrome-linux/chrome',
    )
    print(f"Your OS {plt}")
elif plt == "Darwin":
    # You must download here http://commondatastorage.googleapis.com/chromium-browser-snapshots/index.html?prefix=Mac/902856/
    # NOTE(review): the mac binaries sit under ../bin/linux/ — confirm layout.
    driver_path, browser_path = (
        '../bin/linux/chromedriver_mac64/chromedriver',
        '../bin/linux/chrome-mac/chrome',
    )
    print(f"Your OS {plt}")
else:
    print(f"Unidentified {plt}")
# Configure a headless Chromium instance suitable for CI/containers.
options = webdriver.ChromeOptions()
#
# Optional user-agent spoofing (left disabled):
#ua = UserAgent()
#userAgent = ua.random
#options.add_argument(f'user-agent={userAgent}')
#
#options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')  # avoid /dev/shm exhaustion in containers
options.add_argument('--disable-extensions')
options.add_argument('--test-type')
#options.add_argument('--single-process')
options.add_argument('--disable-gpu')
options.add_argument('--disable-software-rasterizer')
# BUG FIX: two add_experimental_option("excludeSwitches", ...) calls do not
# accumulate — the second replaced the first, dropping "enable-logging".
# Supply both switches in a single call.
options.add_experimental_option("excludeSwitches", ["enable-logging", "enable-automation"])
options.binary_location = browser_path
browser = webdriver.Chrome(executable_path = driver_path, options=options)
# CDP user-agent overrides (left disabled):
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (Linux; Android 8.1.0; Pixel Build/OPM4.171019.021.D1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.109 Mobile Safari/537.36 EdgA/42.0.0.2057", "platform":"Windows"})
#browser.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36", "platform":"Windows"})
browser.set_window_size(1920, 1080)
browser.maximize_window()
# Run metadata and crawl configuration.
parsing_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # timestamp stored with every parsed post
baseurls = ["https://www.social-searcher.com"]  # service used for parsing
methods = ['google-social-search', 'social-buzz']  # sub-services/plans for parsing
sites = ['fb', 'tw', 'in']  # social networks to query
keywords = ["מפלגת העבודה", "שרת התחבורה", "מיכאלי", "מירב מיכאלי"]  # search terms to iterate over
page_to = 10  # total pages to parse per network (service max = 10)
posts = []  # accumulator for parsed posts
def parse_list(browser, social_name, keyword, page_num=1):
    """Scrape the Google CSE result cards currently rendered in *browser*.

    Appends one dict per card to the module-level ``posts`` list.

    Args:
        browser: Selenium WebDriver already switched into the results iframe.
        social_name: label of the social network being parsed.
        keyword: search term that produced this results page.
        page_num: 1-based results-page number (logging only).
    """
    # Print current page info
    print(f"Parsing {social_name} with keyword @{keyword} on page #{page_num}")
    html = browser.page_source
    bs = BeautifulSoup(html, 'lxml')
    cards_html = bs.find('div', {'class': 'gsc-expansionArea'})
    cards_array = cards_html.find_all('div', {'class': 'gsc-webResult'})
    # Iterate over the post cards on the current page
    for card in cards_array:
        name = card.find('a', {'class': 'gs-title'}).text
        text = card.find('div', {'class': 'gs-snippet'}).text
        url = card.find('div', {'class': 'gs-visibleUrl-long'}).text
        date = None  # not exposed by the widget; kept for schema completeness
        pic = None
        # BUG FIX: narrowed the bare `except:` — only the absence of an image
        # (find() -> None => TypeError) or of its src (KeyError) is expected.
        try:
            pic = card.find('img', {'class': 'gs-image'})['src']
        except (TypeError, KeyError):
            pass
        # Build the post record
        post = {
            'date_of_add': parsing_date,
            'post': url,
            'pic': pic,
            'name': name,
            'date': date,
            'text': text,
            'keyword': keyword,
            'sentiment': None,  # placeholder; not computed here
            'social_name': social_name
        }
        posts.append(post)
# Let the page settle before the first scrape.
time.sleep(3)
browser.save_screenshot("./screenshot.png")
from urllib.parse import quote
# Iterate over the keywords
for keyword in keywords:
    # Generate the search URL with keyword and social-network parameters.
    # BUG FIX: percent-encode the keyword — the Hebrew phrases contain
    # spaces and non-ASCII characters, which are invalid in a raw URL.
    params = '?q=' + quote(keyword) + ''.join(['&' + str(p) + '=on' for p in sites])
    url = f'{baseurls[0]}/{methods[0]}/' + params
    print(url)
    # Navigate to the generated URL
    browser.get(url)
    time.sleep(2)
    browser.save_screenshot("./screenshot.png")
    # Each social network renders its results in its own iframe column.
    for social in browser.find_elements_by_class_name('iframe-column'):
        social_name = social.find_element_by_class_name('network-name').text
        social_iframe = social.find_element_by_tag_name('iframe')
        browser.switch_to.frame(social_iframe)
        # The first results page is already rendered.
        parse_list(browser, social_name, keyword, page_num=1)
        # Click through the remaining pager links inside the iframe.
        for page_num in range(2, page_to + 1):
            page_link = browser.find_element_by_xpath(f"//div[@aria-label='Page {page_num}']")
            browser.execute_script("arguments[0].click();", page_link)
            parse_list(browser, social_name, keyword, page_num)
        # Switch back from the iframe to the main document.
        browser.switch_to.default_content()
print(posts)
browser.quit()