J'avais codé pour un amis un truc similaire. Voici le code si ca peut t'aider.
Code : Tout sélectionner
EnableExplicit
Enumeration
#WinMain
#EditorResult
#StringURL
#BtnAnalyze
EndEnumeration
Structure TagCount
name.s
count.i
EndStructure
Global NewList Counts.TagCount()
;------------------------------------------
; Outils texte
;------------------------------------------
Procedure.s StripTags(Text.s)
Protected rx, result.s = Text
rx = CreateRegularExpression(#PB_Any, "<[^>]+>", #PB_RegularExpression_NoCase | #PB_RegularExpression_DotAll)
If rx
result = ReplaceRegularExpression(rx, result, "")
FreeRegularExpression(rx)
EndIf
result = ReplaceString(result, " ", " ")
result = ReplaceString(result, "&", "&")
result = ReplaceString(result, """, Chr(34))
result = ReplaceString(result, "<", "<")
result = ReplaceString(result, ">", ">")
; nettoyage simple des espaces
While FindString(result, " ")
result = ReplaceString(result, " ", " ")
Wend
ProcedureReturn Trim(result)
EndProcedure
Procedure.s DownloadHTML(URL.s)
Protected *Buffer, html.s, size
*Buffer = ReceiveHTTPMemory(URL)
If *Buffer = 0
ProcedureReturn ""
EndIf
size = MemorySize(*Buffer)
If size > 0
html = PeekS(*Buffer, size, #PB_UTF8 | #PB_ByteLength)
EndIf
FreeMemory(*Buffer)
ProcedureReturn html
EndProcedure
Procedure.i CountMatches(Text.s, Pattern.s)
Protected rx, n = 0
rx = CreateRegularExpression(#PB_Any, Pattern, #PB_RegularExpression_NoCase | #PB_RegularExpression_DotAll)
If rx
If ExamineRegularExpression(rx, Text)
While NextRegularExpressionMatch(rx)
n + 1
Wend
EndIf
FreeRegularExpression(rx)
EndIf
ProcedureReturn n
EndProcedure
Procedure.s ExtractFirstGroup(Text.s, Pattern.s)
Protected rx, result.s = ""
rx = CreateRegularExpression(#PB_Any, Pattern, #PB_RegularExpression_NoCase | #PB_RegularExpression_DotAll)
If rx
If ExamineRegularExpression(rx, Text)
If NextRegularExpressionMatch(rx)
result = RegularExpressionGroup(rx, 1)
EndIf
EndIf
FreeRegularExpression(rx)
EndIf
ProcedureReturn result
EndProcedure
Procedure.s ExtractHeadings(Text.s)
Protected rx, result.s = "", level.s, content.s
rx = CreateRegularExpression(#PB_Any, "<h([1-6])[^>]*>(.*?)</h\1>", #PB_RegularExpression_NoCase | #PB_RegularExpression_DotAll)
If rx
If ExamineRegularExpression(rx, Text)
While NextRegularExpressionMatch(rx)
level = RegularExpressionGroup(rx, 1)
content = StripTags(RegularExpressionGroup(rx, 2))
If content <> ""
result + "H" + level + " : " + content + #CRLF$
EndIf
Wend
EndIf
FreeRegularExpression(rx)
EndIf
If result = ""
result = "(aucun titre H1-H6 trouvé)" + #CRLF$
EndIf
ProcedureReturn result
EndProcedure
Procedure BuildCounts(Text.s)
ClearList(Counts())
AddElement(Counts()) : Counts()\name = "div" : Counts()\count = CountMatches(Text, "<div\b")
AddElement(Counts()) : Counts()\name = "section" : Counts()\count = CountMatches(Text, "<section\b")
AddElement(Counts()) : Counts()\name = "article" : Counts()\count = CountMatches(Text, "<article\b")
AddElement(Counts()) : Counts()\name = "header" : Counts()\count = CountMatches(Text, "<header\b")
AddElement(Counts()) : Counts()\name = "footer" : Counts()\count = CountMatches(Text, "<footer\b")
AddElement(Counts()) : Counts()\name = "nav" : Counts()\count = CountMatches(Text, "<nav\b")
AddElement(Counts()) : Counts()\name = "main" : Counts()\count = CountMatches(Text, "<main\b")
AddElement(Counts()) : Counts()\name = "p" : Counts()\count = CountMatches(Text, "<p\b")
AddElement(Counts()) : Counts()\name = "a" : Counts()\count = CountMatches(Text, "<a\b")
AddElement(Counts()) : Counts()\name = "img" : Counts()\count = CountMatches(Text, "<img\b")
AddElement(Counts()) : Counts()\name = "ul" : Counts()\count = CountMatches(Text, "<ul\b")
AddElement(Counts()) : Counts()\name = "li" : Counts()\count = CountMatches(Text, "<li\b")
AddElement(Counts()) : Counts()\name = "table" : Counts()\count = CountMatches(Text, "<table\b")
AddElement(Counts()) : Counts()\name = "form" : Counts()\count = CountMatches(Text, "<form\b")
EndProcedure
Procedure.s GetCountsReport()
Protected report.s = "", totalHeadings
ForEach Counts()
report + RSet(Counts()\name, 10, " ") + " : " + Str(Counts()\count) + #CRLF$
Next
totalHeadings = CountMatches(GetGadgetText(#EditorResult), "H[1-6]\s:")
ProcedureReturn report
EndProcedure
Procedure.s AnalyzeHTML(URL.s)
Protected html.s, title.s, headings.s, report.s, metaDesc.s
html = DownloadHTML(URL)
If html = ""
ProcedureReturn "Erreur : impossible de télécharger la page."
EndIf
title = StripTags(ExtractFirstGroup(html, "<title[^>]*>(.*?)</title>"))
If title = ""
title = "(aucun title trouvé)"
EndIf
metaDesc = ExtractFirstGroup(html, "<meta[^>]+name\s*=\s*[" + Chr(34) + "']description[" + Chr(34) + "'][^>]+content\s*=\s*[" + Chr(34) + "'](.*?)[" + Chr(34) + "']")
If metaDesc = ""
; variante si content arrive avant name
metaDesc = ExtractFirstGroup(html, "<meta[^>]+content\s*=\s*[" + Chr(34) + "'](.*?)[" + Chr(34) + "'][^>]+name\s*=\s*[" + Chr(34) + "']description[" + Chr(34) + "']")
EndIf
headings = ExtractHeadings(html)
BuildCounts(html)
report = "Analyse de la page : " + URL + #CRLF$ + #CRLF$
report + "TITLE" + #CRLF$
report + "-----" + #CRLF$
report + title + #CRLF$ + #CRLF$
report + "META DESCRIPTION" + #CRLF$
report + "----------------" + #CRLF$
If metaDesc <> ""
report + StripTags(metaDesc) + #CRLF$ + #CRLF$
Else
report + "(aucune meta description trouvée)" + #CRLF$ + #CRLF$
EndIf
report + "TITRES H1 à H6" + #CRLF$
report + "-------------" + #CRLF$
report + headings + #CRLF$
report + "STRUCTURE HTML" + #CRLF$
report + "--------------" + #CRLF$
report + GetCountsReport() + #CRLF$
report + "TAILLE HTML" + #CRLF$
report + "----------" + #CRLF$
report + Str(StringByteLength(html, #PB_UTF8)) + " octets environ" + #CRLF$
ProcedureReturn report
EndProcedure
;------------------------------------------
; Interface
;------------------------------------------
If OpenWindow(#WinMain, 0, 0, 900, 650, "Analyseur HTML", #PB_Window_SystemMenu | #PB_Window_ScreenCentered)
StringGadget(#StringURL, 10, 10, 700, 28, "https://example.com")
ButtonGadget(#BtnAnalyze, 720, 10, 170, 28, "Analyser la page")
EditorGadget(#EditorResult, 10, 50, 880, 590)
SetGadgetText(#EditorResult, "Entrez une URL puis cliquez sur 'Analyser la page'.")
Define url.s
Repeat
Select WaitWindowEvent()
Case #PB_Event_CloseWindow
Break
Case #PB_Event_Gadget
Select EventGadget()
Case #BtnAnalyze
url = Trim(GetGadgetText(#StringURL))
If url <> ""
SetGadgetText(#EditorResult, "Téléchargement et analyse en cours..." + #CRLF$)
SetGadgetText(#EditorResult, AnalyzeHTML(url))
Else
SetGadgetText(#EditorResult, "Veuillez saisir une URL valide.")
EndIf
EndSelect
EndSelect
ForEver
EndIf