Urlopen e BeautifulSoup

image_pdfimage_print

Exemplo simples para início rápido no Windows e no Linux (o exemplo para Windows usa Python 3; os exemplos para Ubuntu / Debian usam Python 2.7)

Windows

from urllib.request import urlopen
from bs4 import BeautifulSoup
# Download the page. The with-block closes the HTTP response as soon as
# parsing is finished instead of leaving the connection open.
with urlopen("http://www.google.com.br") as html:
    # BeautifulSoup accepts the file-like response directly; 'html.parser'
    # selects the parser that ships with Python (no extra install needed).
    soup = BeautifulSoup(html, 'html.parser')
# Print the parsed document re-indented for readability.
print(soup.prettify())

Ubuntu / Debian

Ex1:

import urllib2 
from bs4 import BeautifulSoup
# Open the URL with urllib2 (Python 2) and feed the response straight to the
# parser; 'html.parser' is the parser bundled with Python.
parsed_page = BeautifulSoup(urllib2.urlopen("http://www.google.com.br"), 'html.parser')
# Render the parse tree as re-indented markup, then print it.
pretty_markup = parsed_page.prettify()
print(pretty_markup)

Ex2:

import urllib2 
from bs4 import BeautifulSoup
import re
# Fetch the index page and parse it with the parser bundled with Python.
html = urllib2.urlopen("http://alerjln1.alerj.rj.gov.br/taqalerj.nsf/AnoInt?OpenForm&Start=1&Count=1000000&ExpandView")
soup = BeautifulSoup(html, 'html.parser')


# Start: 1 to 7 (the pages that make up the list) = total
# cont stays 0 when no link matches; otherwise it ends as the number of links.
cont = 0
# Visit every <a> tag whose href contains "OpenDocument", numbering from 1.
for cont, link in enumerate(soup.findAll("a", href=re.compile("(OpenDocument)")), 1):
    # Prefix the site host and append the section-expansion suffix.
    # NOTE(review): assumes the hrefs are site-relative -- confirm against the page.
    output = "http://alerjln1.alerj.rj.gov.br/" + link.attrs['href'] + "&ExpandSection=1#_Section1"
    # Format inside the call: applying % to the result of print() only works
    # with the Python 2 print statement and raises TypeError otherwise.
    print("%s - [%s]" % (output, cont))
    print(" \n \n")

Ex3:

import urllib2 ,cookielib
from bs4 import BeautifulSoup
import re


site = "http://www.ebook777.com/?s=codeigniter"
# Browser-like request headers.
# NOTE(review): presumably needed because the site rejects urllib2's default
# User-Agent -- confirm.
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}

# Build the request with the custom headers and show the request object.
req = urllib2.Request(site, headers=hdr)
print(req)

try:
    page = urllib2.urlopen(req)
except urllib2.HTTPError as e:
    # Show the error body returned by the server, then stop: there is no
    # page to parse, and continuing would fail on the unbound name `page`.
    print(e.fp.read())
    raise SystemExit(1)

content = page.read()

soup = BeautifulSoup(content, 'html.parser')
# cont stays 0 when no link matches; otherwise it ends as the number of links.
cont = 0
# Visit every <a> tag whose href contains "book", numbering from 1.
for cont, link in enumerate(soup.findAll("a", href=re.compile("(book)")), 1):
    # NOTE(review): assumes the hrefs carry no scheme of their own -- confirm
    # against the page, otherwise this yields "http://http://...".
    output = "http://" + link.attrs['href']
    # Format inside the call: applying % to the result of print() only works
    # with the Python 2 print statement and raises TypeError otherwise.
    print("%s - [%s]" % (output, cont))
    print(" \n \n")
Marcado como: BeautifulSoup, python, urlopen

Deixe uma resposta

O seu endereço de e-mail não será publicado. Campos obrigatórios são marcados com *